{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE ScopedTypeVariables     #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , getDataText_Debug
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowCorpusUser
  , flowAnnuaire
  , insertMasterDocs
  , saveDocNgramsWith

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  , indexAllDocumentsWithPosTag
  )
    where

import Conduit
import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
import Control.Monad.Reader (MonadReader)
import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources)
import qualified Data.Conduit.List as CList
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.List (concat)
import Data.Map.Strict (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import Servant.Client (ClientError)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map.Strict as Map
import qualified Data.Conduit as C

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
-- import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils (addTuples)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
--import qualified Prelude

------------------------------------------------------------------------
-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Root (getRootId)
import Gargantext.Database.Query.Tree (findNodesId)
import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: ( MonadReader env m
                  , HasConfig env ) => m [DataOrigin]
allDataOrigins = do
  ext <- API.externalAPIs

  pure $ map InternalOrigin ext
      <> map ExternalOrigin ext
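
-- Example (illustrative only; 'PubMed' and 'HAL' stand for whatever
-- constructors 'API.externalAPIs' actually returns for your config):
-- if the configured external APIs are [PubMed, HAL], this yields
--   [InternalOrigin PubMed, InternalOrigin HAL, ExternalOrigin PubMed, ExternalOrigin HAL]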

---------------
data DataText = DataOld ![NodeId]
              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
--- | DataNew ![[HyperdataDocument]]

-- Show instance is not possible because of IO
printDataText :: DataText -> IO ()
printDataText (DataOld xs) = putStrLn $ show xs
printDataText (DataNew (maybeInt, conduitData)) = do
  res <- C.runConduit (conduitData .| CList.consume)
  putStrLn $ show (maybeInt, res)

-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li = liftBase $ do
  eRes <- API.get api (_tt_lang la) q li
  pure $ DataNew <$> eRes

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ Right $ DataOld ids

getDataText_Debug :: FlowCmdM env err m
                  => DataOrigin
                  -> TermType Lang
                  -> API.Query
                  -> Maybe API.Limit
                  -> m ()
getDataText_Debug a l q li = do
  result <- getDataText a l q li
  case result of
    Left  err -> liftBase $ putStrLn $ show err
    Right res -> liftBase $ printDataText res

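-- Usage sketch (hypothetical arguments, not a doctest; 'PubMed' stands
-- for an actual constructor of 'API.ExternalAPIs' in your setup):
-- fetch at most 10 documents matching "covid" and print them.
--
-- > getDataText_Debug (ExternalOrigin PubMed) (Multi EN) "covid" (Just 10)
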
-------------------------------------------------------------------------------
flowDataText :: forall env err m.
                ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> (JobLog -> m ())
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ = do
  (_userId, userCorpusId, listId) <- createNodes u (Right [cid]) corpusType
  _ <- Doc.add userCorpusId ids
  flowCorpusUser (_tt_lang tt) u userCorpusId listId corpusType mfslw
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
  flowCorpus u (Right [cid]) tt mfslw (mLen, transPipe liftBase txtC) logStatus

------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> (JobLog -> m ())
             -> m AnnuaireId
flowAnnuaire u n l filePath logStatus = do
  -- TODO Conduit for file
  docs <- liftBase (readFile_Annuaire filePath :: IO [HyperdataContact])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing
       (Just $ fromIntegral $ length docs, yieldMany docs) logStatus

------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purposes; currently unused)
               -> TermType Lang
               -> FileType
               -> FileFormat
               -> FilePath
               -> Maybe FlowSocialListWith
               -> (JobLog -> m ())
               -> m CorpusId
flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ft ff fp
  case eParsed of
    Right parsed -> do
      flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
      --let docs = splitEvery 500 $ take l parsed
      --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
    Left e -> panic $ "Error: " <> T.pack e

------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> (Maybe Integer, ConduitT () a m ())
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


flow :: forall env err m a c.
        ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> (Maybe Integer, ConduitT () a m ())
     -> (JobLog -> m ())
     -> m CorpusId
flow c u cn la mfslw (mLength, docsC) logStatus = do
  (_userId, userCorpusId, listId) <- createNodes u cn c
  -- TODO if public insertMasterDocs else insertUserDocs
  _ <- runConduit $ zipSources (yieldMany [1..]) docsC
                 .| CList.chunksOf 100
                 .| mapMC insertDocs'
                 .| mapM_C (\ids' -> do
                              _ <- Doc.add userCorpusId ids'
                              pure ())
                 .| sinkList

  _ <- flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw

  -- ids <- traverse (\(idx, doc) -> do
  --                     id <- insertMasterDocs c la doc
  --                     logStatus JobLog { _scst_succeeded = Just $ 1 + idx
  --                                      , _scst_failed    = Just 0
  --                                      , _scst_remaining = Just $ length docs - idx
  --                                      , _scst_events    = Just []
  --                                      }
  --                     pure id
  --                 ) (zip [1..] docs)
  --printDebug "[flow] calling flowCorpusUser" (0 :: Int)
  pure userCorpusId
  --flowCorpusUser (la ^. tt_lang) u cn c ids mfslw

  where
    insertDocs' :: [(Integer, a)] -> m [NodeId]
    insertDocs' [] = pure []
    insertDocs' docs = do
      -- printDebug "[flow] calling insertDoc, ([idx], mLength) = " (fst <$> docs, mLength)
      ids <- insertMasterDocs c la (snd <$> docs)
      let maxIdx = maximum (fst <$> docs)
      case mLength of
        Nothing -> pure ()
        Just len -> do
          -- indices in 'docsC' start at 1, so 'maxIdx' is exactly the
          -- number of documents processed so far
          logStatus JobLog { _scst_succeeded = Just $ fromIntegral maxIdx
                           , _scst_failed    = Just 0
                           , _scst_remaining = Just $ fromIntegral $ len - maxIdx
                           , _scst_events    = Just []
                           }
      pure ids
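
-- Progress-reporting example (numbers are illustrative): with
-- mLength = Just 250 and the chunk size of 100 above, 'insertDocs''
-- fires three times, logging (succeeded, remaining) =
-- (100, 150), (200, 50), (250, 0).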

------------------------------------------------------------------------
createNodes :: ( FlowCmdM env err m
               , MkCorpus c
               )
            => User
            -> Either CorpusName [CorpusId]
            -> Maybe c
            -> m (UserId, CorpusId, ListId)
createNodes user corpusName ctype = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId

  -- User Graph Flow
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId

  pure (userId, userCorpusId, listId)


flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> CorpusId
               -> ListId
               -> Maybe c
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user userCorpusId listId ctype mfslw = do
  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
  _ <- case mfslw of
         (Just (NoList _)) -> do
           -- printDebug "Do not build list" mfslw
           pure ()
         _ -> do
           ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw
                    $ GroupWithPosTag l CoreNLP HashMap.empty

           -- printDebug "flowCorpusUser:ngs" ngs

           _userListId <- flowList_DbRepo listId ngs
           _mastListId <- getOrMkList masterCorpusId masterUserId
           pure ()
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  _ <- updateNgramsOccurrences userCorpusId (Just listId)

  pure userCorpusId


insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
    <- mapNodeIdNgrams
       <$> documentIdWithNgrams
             (extractNgramsT $ withLang lang documentsWithId)
             documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'

saveDocNgramsWith :: (FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
                   $ map (first _ngramsTerms . second Map.keys)
                   $ HashMap.toList mapNgramsDocs

  --printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  let ngrams2insert = catMaybes [ ContextNodeNgrams2 <$> Just nId
                                                     <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                                     <*> Just (fromIntegral w :: Double)
                                | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                                , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                                , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight
                                ]
  -- printDebug "Ngrams2Insert" ngrams2insert
  _return <- insertContextNodeNgrams2 ngrams2insert

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()
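
-- Worked example of the flattening above (names and values are
-- illustrative): an entry
--   "covid" -> { NgramsTerms -> { ctx42 -> (3, 5) } }
-- in 'mapNgramsDocs' becomes one 'ContextNodeNgrams2' row linking
-- context ctx42 to the list-local ngrams id of "covid" with weight 3.0;
-- the TermsCount (5) is dropped here, and rows for which 'getCgramsId'
-- finds no id are discarded by 'catMaybes'.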


------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId Nothing docs
  -- printDebug "newIds" newIds
  let
    newIds'         = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)


------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.viewUniqId': unique id not found"


toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      Indexed <$> fmap reId (lookup sha rs)
              <*> Just hpd
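
-- Note the interaction with 'toInserted' above: since 'toInserted'
-- keeps only ReturnIds with reInserted set, a document whose hash is
-- already in the database has no entry in 'rs', the lookup yields
-- Nothing, and it is silently dropped from the result.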

------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => (a
                         -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e


-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                     (Map NgramsType
                       (Map NodeId (Int, TermsCount))
                     )
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
  where
    -- NOTE We are somehow multiplying 'TermsCount' here: if the
    -- same ngrams term has different ngrams types, the 'TermsCount'
    -- for it (which is the number of times the term appears in a
    -- document) is copied over to all its types.
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
    f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
      where
        nId = _index $ documentWithId d
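
-- Worked example of the NOTE above (illustrative): if "Smith" occurs
-- 3 times in document 42 and is extracted both as Authors and as
-- NgramsTerms, 'f' yields
--   "Smith" -> { Authors     -> { 42 -> (1, 3) }
--              , NgramsTerms -> { 42 -> (1, 3) } }
-- i.e. the count 3 is duplicated across both types.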


------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors]


instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (splitOn Institutes (doc ^. hd_bdd))
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn Authors (doc ^. hd_bdd))
                      $ _hd_authors doc

          termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt)) . concat
                          <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $  [(SimpleNgrams source, (Map.singleton Sources 1, 1))]
               <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes]
               <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors]
               <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts']
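
-- Example result shape (values are illustrative, not from a real doc;
-- 'EnrichedNgrams' actually wraps the POS-enriched term, shown here as
-- plain text): a document with source "HAL", one author "Smith", and
-- the extracted term "machine learning" counted twice maps to
--   { SimpleNgrams   "HAL"              -> ({Sources     -> 1}, 1)
--   , SimpleNgrams   "Smith"            -> ({Authors     -> 1}, 1)
--   , EnrichedNgrams "machine learning" -> ({NgramsTerms -> 1}, 2) }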

instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h

instance HasText a => HasText (Node a)
  where
    hasText (Node { _node_hyperdata = h }) = hasText h


-- | Upgrade function: suppose all documents are English (currently the case).
-- TODO: put this elsewhere.
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  rootId <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  _ <- mapM extractInsert (splitEvery 1000 docs)
  pure ()

extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
  mapNgramsDocs' <- mapNodeIdNgrams
                    <$> documentIdWithNgrams
                          (extractNgramsT $ withLang (Multi EN) documentsWithId)
                          documentsWithId
  _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  pure ()