{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
-- check userId CanFillUserCorpus userCorpusId
-- check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE ScopedTypeVariables     #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , getDataText_Debug
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowCorpusUser
  , flowAnnuaire
  , insertMasterDocs
  , saveDocNgramsWith

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  , indexAllDocumentsWithPosTag
  )
  where

import Conduit
import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
import Control.Monad.Reader (MonadReader)
import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources)
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import Servant.Client (ClientError)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map
import qualified Data.Conduit.List as CL
import qualified Data.Conduit as C

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils (addTuples)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
import qualified Prelude

------------------------------------------------------------------------
-- Imports for the upgrade function
import Gargantext.Database.Query.Tree.Root (getRootId)
import Gargantext.Database.Query.Tree (findNodesId)
import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
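-- | Where the data flowing into a corpus comes from: 'InternalOrigin'
-- reuses documents already harvested into the local database for the
-- given API, while 'ExternalOrigin' queries that API live.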
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: ( MonadReader env m
                  , HasConfig env) => m [DataOrigin]
allDataOrigins = do
  ext <- API.externalAPIs

  pure $ map InternalOrigin ext
      <> map ExternalOrigin ext

---------------
data DataText = DataOld ![NodeId]
              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
              --- | DataNew ![[HyperdataDocument]]

-- A Show instance is not possible because of the IO action inside 'DataNew'.
printDataText :: DataText -> IO ()
printDataText (DataOld xs) = putStrLn $ show xs
printDataText (DataNew (maybeInt, conduitData)) = do
  res <- C.runConduit (conduitData .| CL.consume)
  putStrLn $ show (maybeInt, res)

-- TODO use the split parameter in the config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li = liftBase $ do
  eRes <- API.get api (_tt_lang la) q li
  pure $ DataNew <$> eRes

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ Right $ DataOld ids
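
-- A minimal usage sketch (hypothetical origin and query; assumes 'PubMed'
-- is one of the configured 'API.ExternalAPIs' and @Mono EN@ a valid
-- 'TermType'):
--
-- > eText <- getDataText (ExternalOrigin API.PubMed) (Mono EN) "glyphosate" (Just 100)
-- > case eText of
-- >   Left  err -> liftBase $ putStrLn $ show err
-- >   Right dt  -> liftBase $ printDataText dt -- or feed @dt@ to 'flowDataText'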

getDataText_Debug :: FlowCmdM env err m
                  => DataOrigin
                  -> TermType Lang
                  -> API.Query
                  -> Maybe API.Limit
                  -> m ()
getDataText_Debug a l q li = do
  result <- getDataText a l q li
  case result of
    Left err  -> liftBase $ putStrLn $ show err
    Right res -> liftBase $ printDataText res


-------------------------------------------------------------------------------
flowDataText :: forall env err m.
                ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> (JobLog -> m ())
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
  flowCorpus u (Right [cid]) tt mfslw (mLen, transPipe liftBase txtC) logStatus
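
-- Note: 'DataOld' only re-attaches already-inserted documents to the target
-- corpus (no new insertion), whereas 'DataNew' streams the freshly fetched
-- documents through the full 'flowCorpus' pipeline.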

------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> TermType Lang
             -> FilePath
             -> (JobLog -> m ())
             -> m AnnuaireId
flowAnnuaire u n l filePath logStatus = do
  -- TODO Conduit for file
  docs <- liftBase (readFile_Annuaire filePath :: IO [HyperdataContact])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus

------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purposes)
               -> TermType Lang
               -> FileType
               -> FileFormat
               -> FilePath
               -> Maybe FlowSocialListWith
               -> (JobLog -> m ())
               -> m CorpusId
flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ft ff fp
  case eParsed of
    Right parsed -> do
      flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
      --let docs = splitEvery 500 $ take l parsed
      --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
    Left e -> panic $ "Error: " <> T.pack e

------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> (Maybe Integer, ConduitT () a m ())
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
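-- 'flowCorpus' is just 'flow' with the corpus node type fixed to
-- 'HyperdataCorpus'.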


flow :: forall env err m a c.
        ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> (Maybe Integer, ConduitT () a m ())
     -> (JobLog -> m ())
     -> m CorpusId
flow c u cn la mfslw (mLength, docsC) logStatus = do
  -- TODO if public insertMasterDocs else insertUserDocs
  ids <- runConduit $ zipSources (yieldMany [1..]) docsC
                   .| mapMC insertDoc
                   .| sinkList
  -- ids <- traverse (\(idx, doc) -> do
  --                     id <- insertMasterDocs c la doc
  --                     logStatus JobLog { _scst_succeeded = Just $ 1 + idx
  --                                      , _scst_failed    = Just 0
  --                                      , _scst_remaining = Just $ length docs - idx
  --                                      , _scst_events    = Just []
  --                                      }
  --                     pure id
  --                 ) (zip [1..] docs)
  flowCorpusUser (la ^. tt_lang) u cn c ids mfslw

  where
    insertDoc :: (Integer, a) -> m NodeId
    insertDoc (idx, doc) = do
      -- NOTE 'insertMasterDocs' is given a single document here, so the
      -- returned list contains exactly one DocId and 'Prelude.head' is safe.
      -- (The binding is named 'docIds' rather than 'id' to avoid shadowing
      -- the Prelude's 'id'.)
      docIds <- insertMasterDocs c la [doc]
      case mLength of
        Nothing -> pure ()
        Just len -> do
          logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + idx
                           , _scst_failed    = Just 0
                           , _scst_remaining = Just $ fromIntegral $ len - idx
                           , _scst_events    = Just []
                           }
      pure $ Prelude.head docIds

------------------------------------------------------------------------
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [ContextId]
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids mfslw = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
  let gp = GroupWithPosTag l CoreNLP HashMap.empty
  ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp

  -- printDebug "flowCorpusUser:ngs" ngs

  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  _ <- updateNgramsOccurrences userCorpusId (Just listId)

  pure userCorpusId
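-- | Insert documents into the master corpus: get or create the master
-- user and corpus, insert the documents there, extract their ngrams and
-- save the document/ngrams associations under the master list.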
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
    <- mapNodeIdNgrams
       <$> documentIdWithNgrams
             (extractNgramsT $ withLang lang documentsWithId)
             documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'

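-- | Persist the ngrams extracted from a set of documents under the given
-- list: insert the ngrams themselves, then the list/ngrams associations
-- ('listInsertDb') and finally one weighted context/node/ngrams row per
-- (ngram, type, document) triple.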
saveDocNgramsWith :: (FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  -- printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
                   $ map (first _ngramsTerms . second Map.keys)
                   $ HashMap.toList mapNgramsDocs

  -- printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  _return <- insertContextNodeNgrams2
               $ catMaybes [ ContextNodeNgrams2 <$> Just nId
                                                <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                                <*> Just (fromIntegral w :: Double)
                           | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                           , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                           , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight
                           ]

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()

------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
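-- | Insert documents, each tagged with a unique hash ('addUniqId'), under
-- the given corpus; returns the new context ids together with the documents
-- indexed by those ids.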
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId Nothing docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)


------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.viewUniqId': unique id not found"


toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      Indexed <$> fmap reId (lookup sha rs)
              <*> Just hpd

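-- A small sketch of 'mergeData' (hypothetical hashes and ids): with
--
-- > rs = Map.fromList [("h1", aReturnIdWith42)] -- reId = 42
-- > hs = Map.fromList [("h1", docA), ("h2", docB)]
--
-- the result is @[Indexed 42 docA]@; documents whose hash has no 'ReturnId'
-- (here @docB@) are dropped by 'catMaybes'.
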
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => (a
                         -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e

-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                     (Map NgramsType
                          (Map NodeId (Int, TermsCount))
                     )
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
  where
    -- NOTE We are somehow multiplying 'TermsCount' here: if the
    -- same ngrams term has different ngrams types, the 'TermsCount'
    -- for it (which is the number of times the term appears in a
    -- document) is copied over to all its types.
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
    f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
      where
        nId = _index $ documentWithId d

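-- A small sketch (hypothetical values): if documents 1 and 2 both produce
-- the ngram @"ia"@ with entries @(Map.singleton NgramsTerms 1, 3)@ and
-- @(Map.singleton NgramsTerms 1, 1)@, the merged result is
--
-- > HashMap.fromList
-- >   [ ("ia", Map.fromList
-- >       [ (NgramsTerms, Map.fromList [(1, (1, 3)), (2, (1, 1))]) ]) ]
--
-- with 'addTuples' combining entries that meet on the same keys.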

------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors]


instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . T.splitOn ", ")
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (T.splitOn ", ")
                      $ _hd_authors doc

          termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
                          <$> concat
                          <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $  [(SimpleNgrams source, (Map.singleton Sources 1, 1))]
               <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes]
               <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors]
               <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts']

instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h

instance HasText a => HasText (Node a)
  where
    hasText (Node { _node_hyperdata = h }) = hasText h

-- | TODO put elsewhere
-- Upgrade function
-- Assume all documents are in English (which is currently the case)
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  rootId <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  _ <- mapM extractInsert (splitEvery 1000 docs)
  pure ()

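-- | Extract the ngrams of one batch of documents (assumed English, as in
-- 'indexAllDocumentsWithPosTag') and insert the extracted ngrams.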
extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
  mapNgramsDocs' <- mapNodeIdNgrams
                    <$> documentIdWithNgrams
                          (extractNgramsT $ withLang (Multi EN) documentsWithId)
                          documentsWithId
  _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  pure ()