{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE ScopedTypeVariables     #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , getDataText_Debug
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowCorpusUser
  , flowAnnuaire
  , insertMasterDocs
  , saveDocNgramsWith

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  , indexAllDocumentsWithPosTag
  )
    where

import Conduit
import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
import Control.Monad.Reader (MonadReader)
import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources)
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.List (concat)
import Data.Map.Strict (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import Servant.Client (ClientError)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map.Strict as Map
import qualified Data.Conduit.List as CL
import qualified Data.Conduit as C

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
-- import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.NLP (nlpServerGet)
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils (addTuples)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
--import qualified Prelude

------------------------------------------------------------------------
-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Root (getRootId)
import Gargantext.Database.Query.Tree (findNodesId)
import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: ( MonadReader env m
                  , HasConfig env ) => m [DataOrigin]
allDataOrigins = do
  ext <- API.externalAPIs

  pure $ map InternalOrigin ext
      <> map ExternalOrigin ext

---------------
data DataText = DataOld ![NodeId]
              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
              --- | DataNew ![[HyperdataDocument]]

-- A Show instance is not possible because of the IO action inside DataNew
printDataText :: DataText -> IO ()
printDataText (DataOld xs) = putStrLn $ show xs
printDataText (DataNew (maybeInt, conduitData)) = do
  res <- C.runConduit (conduitData .| CL.consume)
  putStrLn $ show (maybeInt, res)

-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li = liftBase $ do
  eRes <- API.get api (_tt_lang la) q li
  pure $ DataNew <$> eRes

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ Right $ DataOld ids

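-- A minimal usage sketch (not from the original module): fetching documents
-- from an external source. 'API.PubMed' stands for whichever constructor of
-- 'API.ExternalAPIs' applies; the query string and limit are hypothetical.
--
-- > eDt <- getDataText (ExternalOrigin API.PubMed) (Multi EN) "glyphosate" (Just 100)
-- > case eDt of
-- >   Left  err -> liftBase $ putStrLn $ show err
-- >   Right dt  -> liftBase $ printDataText dt
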
getDataText_Debug :: FlowCmdM env err m
                  => DataOrigin
                  -> TermType Lang
                  -> API.Query
                  -> Maybe API.Limit
                  -> m ()
getDataText_Debug a l q li = do
  result <- getDataText a l q li
  case result of
    Left  err -> liftBase $ putStrLn $ show err
    Right res -> liftBase $ printDataText res


-------------------------------------------------------------------------------
flowDataText :: forall env err m.
                ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> (JobLog -> m ())
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ = do
  (_userId, userCorpusId, listId) <- createNodes u (Right [cid]) corpusType
  _ <- Doc.add userCorpusId ids
  flowCorpusUser (_tt_lang tt) u userCorpusId listId corpusType mfslw
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
  flowCorpus u (Right [cid]) tt mfslw (mLen, transPipe liftBase txtC) logStatus

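-- Usage sketch (hypothetical names): piping 'getDataText' output into an
-- existing corpus 'cid'. 'DataOld' merely attaches already-indexed documents
-- to the corpus, while 'DataNew' streams and inserts fresh ones through
-- 'flowCorpus'.
--
-- > Right dt <- getDataText origin (Multi EN) query limit
-- > cid'     <- flowDataText user dt (Multi EN) cid mfslw logStatus
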
------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> (JobLog -> m ())
             -> m AnnuaireId
flowAnnuaire u n l filePath logStatus = do
  -- TODO Conduit for file
  docs <- liftBase (readFile_Annuaire filePath :: IO [HyperdataContact])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus

------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purposes; currently unused)
               -> TermType Lang
               -> FileType
               -> FileFormat
               -> FilePath
               -> Maybe FlowSocialListWith
               -> (JobLog -> m ())
               -> m CorpusId
flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ft ff fp
  case eParsed of
    Right parsed -> do
      flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
      --let docs = splitEvery 500 $ take l parsed
      --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
    Left e       -> panic $ "Error: " <> T.pack e

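-- Usage sketch: parsing a CSV export and flowing it into a new corpus. The
-- file path, corpus name and the 'CsvGargV3'/'Plain' constructors below are
-- hypothetical placeholders for whichever 'FileType'/'FileFormat' applies.
--
-- > cid <- flowCorpusFile (UserName "alice") (Left "my-corpus") 10000 (Multi EN)
-- >                       CsvGargV3 Plain "/tmp/docs.csv" Nothing logStatus
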
------------------------------------------------------------------------
-- | TODO improve the type needed to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> (Maybe Integer, ConduitT () a m ())
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


flow :: forall env err m a c.
        ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> (Maybe Integer, ConduitT () a m ())
     -> (JobLog -> m ())
     -> m CorpusId
flow c u cn la mfslw (mLength, docsC) logStatus = do
  (_userId, userCorpusId, listId) <- createNodes u cn c
  -- TODO if public insertMasterDocs else insertUserDocs
  _ <- runConduit $ zipSources (yieldMany [1..]) docsC
                 .| CL.chunksOf 100
                 .| mapMC insertDocs'
                 .| mapM_C (\ids' -> do
                              _ <- Doc.add userCorpusId ids'
                              pure ())
                 .| sinkList

  _ <- flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw

  -- ids <- traverse (\(idx, doc) -> do
  --                     id <- insertMasterDocs c la doc
  --                     logStatus JobLog { _scst_succeeded = Just $ 1 + idx
  --                                      , _scst_failed    = Just 0
  --                                      , _scst_remaining = Just $ length docs - idx
  --                                      , _scst_events    = Just []
  --                                      }
  --                     pure id
  --                 ) (zip [1..] docs)
  --printDebug "[flow] calling flowCorpusUser" (0 :: Int)
  pure userCorpusId
  --flowCorpusUser (la ^. tt_lang) u cn c ids mfslw

  where
    insertDocs' :: [(Integer, a)] -> m [NodeId]
    insertDocs' [] = pure []
    insertDocs' docs = do
      -- printDebug "[flow] calling insertDoc, ([idx], mLength) = " (fst <$> docs, mLength)
      ids <- insertMasterDocs c la (snd <$> docs)
      let maxIdx = maximum (fst <$> docs)
      case mLength of
        Nothing  -> pure ()
        Just len -> do
          logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + maxIdx
                           , _scst_failed    = Just 0
                           , _scst_remaining = Just $ fromIntegral $ len - maxIdx
                           , _scst_events    = Just []
                           }
      pure ids


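-- Progress arithmetic sketch: with mLength = Just 250 and chunks of 100,
-- insertDocs' sees maxIdx = 100, 200 and 250, so the JobLogs report
-- (succeeded, remaining) = (101, 150), (201, 50) and (251, 0); note that
-- `1 + maxIdx` makes the final "succeeded" overshoot the total by one.
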
------------------------------------------------------------------------
createNodes :: ( FlowCmdM env err m
               , MkCorpus c
               )
            => User
            -> Either CorpusName [CorpusId]
            -> Maybe c
            -> m (UserId, CorpusId, ListId)
createNodes user corpusName ctype = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId

  -- User Graph Flow
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId

  pure (userId, userCorpusId, listId)

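-- Usage sketch (hypothetical user and corpus name). Thanks to
-- getOrMk_RootWithCorpus and the *IfNotExists variants, calling this twice
-- with the same arguments should reuse the existing nodes:
--
-- > (uId, corpusId, listId) <- createNodes (UserName "alice") (Left "my-corpus")
-- >                                        (Nothing :: Maybe HyperdataCorpus)
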
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> CorpusId
               -> ListId
               -> Maybe c
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user userCorpusId listId ctype mfslw = do
  server <- view (nlpServerGet l)
  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
  _ <- case mfslw of
    (Just (NoList _)) -> do
      -- printDebug "Do not build list" mfslw
      pure ()
    _ -> do
      ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw
               $ GroupWithPosTag l server HashMap.empty

      -- printDebug "flowCorpusUser:ngs" ngs

      _userListId <- flowList_DbRepo listId ngs
      _mastListId <- getOrMkList masterCorpusId masterUserId
      pure ()
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  _ <- updateNgramsOccurrences userCorpusId (Just listId)

  pure userCorpusId


insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
    <- mapNodeIdNgrams
       <$> documentIdWithNgrams
             (extractNgramsT $ withLang lang documentsWithId)
             documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'

saveDocNgramsWith :: (FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
                   $ map (first _ngramsTerms . second Map.keys)
                   $ HashMap.toList mapNgramsDocs

  --printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  let ngrams2insert = catMaybes [ ContextNodeNgrams2 <$> Just nId
                                                     <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                                     <*> Just (fromIntegral w :: Double)
                                | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                                , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                                , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight
                                ]
  -- printDebug "Ngrams2Insert" ngrams2insert
  _return <- insertContextNodeNgrams2 ngrams2insert

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()

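-- Note on saveDocNgramsWith: the triple 'traverse' peels the three nested
-- layers (HashMap values, then both Map levels) and applies 'fst' to each
-- (Int, TermsCount) leaf, e.g. (sketch, with hypothetical keys ng and nId):
--
-- > over (traverse . traverse . traverse) fst
-- >      (HashMap.singleton ng (Map.singleton NgramsTerms (Map.singleton nId (3, 7))))
-- > -- == HashMap.singleton ng (Map.singleton NgramsTerms (Map.singleton nId 3))
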
------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId Nothing docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)


------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.toInsert"


toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      Indexed <$> fmap reId (lookup sha rs)
              <*> Just hpd

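-- mergeData sketch: documents are re-associated to their database ids via
-- their hash, and docs whose hash is missing from the ReturnId map (i.e.
-- never inserted) are silently dropped by 'catMaybes'. E.g. if rs maps a
-- hash "h1" to a ReturnId with reId = 42, a doc stored under "h1" becomes
-- 'Indexed 42 doc', while a doc under an unknown hash yields Nothing.
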
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => (a
                         -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e


-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                       (Map NgramsType
                            (Map NodeId (Int, TermsCount))
                       )
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
  where
    -- | NOTE We are somehow multiplying 'TermsCount' here: if the
    -- same ngrams term has different ngrams types, the 'TermsCount'
    -- for it (which is the number of times the term appears in a
    -- document) is copied over to all its types.
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
    f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
      where
        nId = _index $ documentWithId d

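-- Worked example (sketch, hypothetical ngram key b0): if documents 1 and 2
-- both contain b0 as NgramsTerms with leaves (1, 3) and (2, 5), the per-doc
-- maps merge into
--
-- > HashMap.fromList [(b0, Map.fromList [(NgramsTerms, Map.fromList [(1, (1,3)), (2, (2,5))])])]
--
-- and when the *same* NodeId occurs twice for one (ngram, type) pair,
-- 'addTuples' sums occurrence count and TermsCount pointwise.
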
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors]

instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (splitOn Institutes (doc ^. hd_bdd))
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn Authors (doc ^. hd_bdd))
                      $ _hd_authors doc

          ncs <- view (nlpServerGet $ lang' ^. tt_lang)

          termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
                            . concat
                          <$> liftBase (extractTerms ncs lang' $ hasText doc)

          pure $ HashMap.fromList
               $  [(SimpleNgrams source, (Map.singleton Sources 1, 1))]
               <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes]
               <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors]
               <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts']

instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h

instance HasText a => HasText (Node a)
  where
    hasText (Node { _node_hyperdata = h }) = hasText h


-- | TODO put elsewhere
-- | Upgrade function
-- Suppose all documents are in English (currently the case)
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  rootId <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  _ <- mapM extractInsert (splitEvery 1000 docs)
  pure ()

extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
  mapNgramsDocs' <- mapNodeIdNgrams
                    <$> documentIdWithNgrams
                          (extractNgramsT $ withLang (Multi EN) documentsWithId)
                          documentsWithId
  _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  pure ()