{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
-- check userId       CanFillUserCorpus   userCorpusId
-- check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE ScopedTypeVariables     #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , getDataText_Debug
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowCorpusUser
  , flowAnnuaire
  , insertMasterDocs
  , saveDocNgramsWith

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  , indexAllDocumentsWithPosTag
  )
  where

import Conduit
import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
import Control.Monad.Reader (MonadReader)
import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources)
import qualified Data.Conduit.List as CList
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (catMaybes, mapMaybe)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import Servant.Client (ClientError)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils (addTuples)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)

------------------------------------------------------------------------
-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Root (getRootId)
import Gargantext.Database.Query.Tree (findNodesId)
import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: ( MonadReader env m
                  , HasConfig env ) => m [DataOrigin]
allDataOrigins = do
  ext <- API.externalAPIs

  pure $ map InternalOrigin ext
      <> map ExternalOrigin ext

---------------
data DataText = DataOld ![NodeId]
              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
--- | DataNew ![[HyperdataDocument]]

-- | A 'Show' instance is not possible because 'DataNew' carries an IO conduit.
printDataText :: DataText -> IO ()
printDataText (DataOld xs) = print xs
printDataText (DataNew (maybeInt, conduitData)) = do
  res <- runConduit (conduitData .| sinkList)
  print (maybeInt, res)
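
-- A minimal usage sketch (illustration only, not part of the original
-- API): build a 'DataNew' from an in-memory list of documents and dump
-- it to stdout.  'exampleDocs' is a hypothetical argument.
--
-- > printDataText_example :: [HyperdataDocument] -> IO ()
-- > printDataText_example exampleDocs =
-- >   printDataText $ DataNew ( Just (fromIntegral $ length exampleDocs)
-- >                           , yieldMany exampleDocs )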

-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li = liftBase $ do
  eRes <- API.get api (_tt_lang la) q li
  pure $ DataNew <$> eRes

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ Right $ DataOld ids

getDataText_Debug :: FlowCmdM env err m
                  => DataOrigin
                  -> TermType Lang
                  -> API.Query
                  -> Maybe API.Limit
                  -> m ()
getDataText_Debug a l q li = do
  result <- getDataText a l q li
  case result of
    Left err  -> liftBase $ print err
    Right res -> liftBase $ printDataText res

-------------------------------------------------------------------------------
flowDataText :: forall env err m.
                ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> (JobLog -> m ())
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
  flowCorpus u (Right [cid]) tt mfslw (mLen, transPipe liftBase txtC) logStatus

------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> TermType Lang
             -> FilePath
             -> (JobLog -> m ())
             -> m AnnuaireId
flowAnnuaire u n l filePath logStatus = do
  -- TODO Conduit for file
  docs <- liftBase (readFile_Annuaire filePath :: IO [HyperdataContact])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus

------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purposes)
               -> TermType Lang
               -> FileType
               -> FileFormat
               -> FilePath
               -> Maybe FlowSocialListWith
               -> (JobLog -> m ())
               -> m CorpusId
flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ft ff fp
  case eParsed of
    Right parsed -> do
      flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
      --let docs = splitEvery 500 $ take l parsed
      --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
    Left e -> panic $ "Error: " <> T.pack e

------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> (Maybe Integer, ConduitT () a m ())
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


flow :: forall env err m a c.
        ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> (Maybe Integer, ConduitT () a m ())
     -> (JobLog -> m ())
     -> m CorpusId
flow c u cn la mfslw (mLength, docsC) logStatus = do
  -- TODO if public insertMasterDocs else insertUserDocs
  ids <- runConduit $ zipSources (yieldMany [1..]) docsC
                   .| CList.chunksOf 100
                   .| mapMC insertDocs'
                   .| CList.concat
                   .| sinkList
  -- ids <- traverse (\(idx, doc) -> do
  --                     id <- insertMasterDocs c la doc
  --                     logStatus JobLog { _scst_succeeded = Just $ 1 + idx
  --                                      , _scst_failed    = Just 0
  --                                      , _scst_remaining = Just $ length docs - idx
  --                                      , _scst_events    = Just []
  --                                      }
  --                     pure id
  --                 ) (zip [1..] docs)
  flowCorpusUser (la ^. tt_lang) u cn c ids mfslw

  where
    insertDocs' :: [(Integer, a)] -> m [NodeId]
    insertDocs' [] = pure []
    insertDocs' docs = do
      printDebug "[flow] calling insertDoc, ([idx], mLength) = " (fst <$> docs, mLength)
      ids <- insertMasterDocs c la (snd <$> docs)
      -- indices are 1-based, so maxIdx is the number of documents
      -- processed so far
      let maxIdx = maximum (fst <$> docs)
      case mLength of
        Nothing -> pure ()
        Just len -> do
          logStatus JobLog { _scst_succeeded = Just $ fromIntegral maxIdx
                           , _scst_failed    = Just 0
                           , _scst_remaining = Just $ fromIntegral $ len - maxIdx
                           , _scst_events    = Just []
                           }
      pure ids


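-- A minimal usage sketch (illustration only, not part of the original
-- API): ingest an in-memory list of documents as a new corpus for a
-- given user.  'exampleDocs', "alice" and "my-corpus" are hypothetical.
--
-- > flowExample :: FlowCmdM env err m
-- >             => [HyperdataDocument] -> (JobLog -> m ()) -> m CorpusId
-- > flowExample exampleDocs logStatus =
-- >   flowCorpus (UserName "alice")
-- >              (Left "my-corpus")
-- >              (Multi EN)
-- >              Nothing
-- >              (Just $ fromIntegral $ length exampleDocs, yieldMany exampleDocs)
-- >              logStatus
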
------------------------------------------------------------------------
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [ContextId]
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids mfslw = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
  _ <- case mfslw of
         (Just (NoList _)) -> do
           printDebug "Do not build list" mfslw
           pure ()
         _ -> do
           ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw
                    $ GroupWithPosTag l CoreNLP HashMap.empty

           -- printDebug "flowCorpusUser:ngs" ngs

           _userListId <- flowList_DbRepo listId ngs
           _mastListId <- getOrMkList masterCorpusId masterUserId
           pure ()
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  _ <- updateNgramsOccurrences userCorpusId (Just listId)

  pure userCorpusId


insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
    <- mapNodeIdNgrams
       <$> documentIdWithNgrams
             (extractNgramsT $ withLang lang documentsWithId)
             documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'

saveDocNgramsWith :: (FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
                   $ map (first _ngramsTerms . second Map.keys)
                   $ HashMap.toList mapNgramsDocs

  --printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  _return <- insertContextNodeNgrams2
               $ catMaybes [ ContextNodeNgrams2 nId
                               <$> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                               <*> Just (fromIntegral w :: Double)
                           | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                           , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                           , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight
                           ]

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()

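-- The comprehension above flattens the nested
-- term -> ngrams type -> context map into one row per combination;
-- e.g. a hypothetical entry "ia" -> {NgramsTerms -> {ctx42 -> (3, _)}}
-- yields a single ContextNodeNgrams2 row linking ctx42 to the cgrams id
-- of "ia" with weight 3.0, while rows whose cgrams id cannot be
-- resolved are dropped by 'catMaybes'.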

------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId Nothing docs
  -- printDebug "newIds" newIds
  let
    newIds'         = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)


------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Action.Flow.viewUniqId': unique id not found"


toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs = mapMaybe toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      Indexed <$> fmap reId (lookup sha rs)
              <*> Just hpd
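
-- A worked sketch of the hash join above (illustration only, with
-- hypothetical values): given
--
-- >   rs   = Map.fromList [("h1", r1)]   -- r1 has reId 42
-- >   docs = Map.fromList [("h1", doc1), ("h2", doc2)]
--
-- 'mergeData rs docs' pairs doc1 with node id 42, while doc2 (whose
-- hash was not returned by the insert) is silently dropped.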

------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => ( a
                        -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)) )
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e


-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                     (Map NgramsType
                       (Map NodeId (Int, TermsCount))
                     )
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
  where
    -- NOTE We are somehow multiplying 'TermsCount' here: if the
    -- same ngrams term has different ngrams types, the 'TermsCount'
    -- for it (which is the number of times the term appears in a
    -- document) is copied over to all its types.
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
    f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
      where
        nId = _index $ documentWithId d

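-- A worked sketch of the aggregation above (illustration only, with
-- hypothetical values): if term "ia" occurs in document 1 as
-- (occurrences, TermsCount) = (2, 2) and in document 2 as (1, 1), both
-- under 'NgramsTerms', the per-document maps
--
-- >   fromList [("ia", fromList [(NgramsTerms, fromList [(1, (2, 2))])])]
-- >   fromList [("ia", fromList [(NgramsTerms, fromList [(2, (1, 1))])])]
--
-- are merged into
--
-- >   fromList [("ia", fromList [(NgramsTerms, fromList [(1, (2, 2)), (2, (1, 1))])])]
--
-- and collisions on the same (term, type, document) key are summed
-- pointwise by 'addTuples'.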

------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (:[])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList [ (SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]

instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . T.splitOn ", ")
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (T.splitOn ", ")
                      $ _hd_authors doc

          termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt)) . concat
                              <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $  [ (SimpleNgrams source, (Map.singleton Sources     1, 1)) ]
               <> [ (SimpleNgrams i',     (Map.singleton Institutes  1, 1)) | i' <- institutes ]
               <> [ (SimpleNgrams a',     (Map.singleton Authors     1, 1)) | a' <- authors ]
               <> [ (EnrichedNgrams t',   (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]

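-- A small sketch of what the instance above produces (illustration
-- only, with hypothetical field values): for a document with
-- _hd_source = Just "HAL" and _hd_authors = Just "Ada, Alan", the
-- resulting map contains
--
-- >   SimpleNgrams "HAL"  -> (fromList [(Sources, 1)], 1)
-- >   SimpleNgrams "Ada"  -> (fromList [(Authors, 1)], 1)
-- >   SimpleNgrams "Alan" -> (fromList [(Authors, 1)], 1)
--
-- plus one 'EnrichedNgrams' entry per extracted term, carrying that
-- term's 'TermsCount'.
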
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h

instance HasText a => HasText (Node a)
  where
    hasText (Node { _node_hyperdata = h }) = hasText h


-- | TODO put elsewhere
-- Upgrade function
-- Assume all documents are in English (currently the case)
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  rootId <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  _ <- mapM extractInsert (splitEvery 1000 docs)
  pure ()

extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
  mapNgramsDocs' <- mapNodeIdNgrams
                    <$> documentIdWithNgrams
                          (extractNgramsT $ withLang (Multi EN) documentsWithId)
                          documentsWithId
  _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  pure ()
583 pure ()