]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[pubmed] implement pubmed api key in hyperdata user
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( DataText(..)
28 , getDataText
29 , getDataText_Debug
30 , flowDataText
31 , flow
32
33 , flowCorpusFile
34 , flowCorpus
35 , flowCorpusUser
36 , flowAnnuaire
37 , insertMasterDocs
38 , saveDocNgramsWith
39
40 , getOrMkRoot
41 , getOrMk_RootWithCorpus
42 , TermType(..)
43 , DataOrigin(..)
44 , allDataOrigins
45
46 , do_api
47 , indexAllDocumentsWithPosTag
48 )
49 where
50
51 import Conduit
52 import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
53 import Data.Aeson.TH (deriveJSON)
54 import Data.Conduit.Internal (zipSources)
55 import qualified Data.Conduit.List as CList
56 import Data.Either
57 import Data.HashMap.Strict (HashMap)
58 import Data.Hashable (Hashable)
59 import Data.List (concat)
60 import Data.Map.Strict (Map, lookup)
61 import Data.Maybe (catMaybes)
62 import Data.Monoid
63 import Data.Swagger
64 import qualified Data.Text as T
65 import Data.Tuple.Extra (first, second)
66 import GHC.Generics (Generic)
67 import System.FilePath (FilePath)
68 import qualified Data.HashMap.Strict as HashMap
69 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
70 import qualified Data.Map.Strict as Map
71 import qualified Data.Conduit.List as CL
72 import qualified Data.Conduit as C
73
74 import Gargantext.Core (Lang(..), PosTagAlgo(..))
75 -- import Gargantext.Core.Ext.IMT (toSchoolName)
76 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
77 import Gargantext.Core.Flow.Types
78 import Gargantext.Core.NLP (nlpServerGet)
79 import Gargantext.Core.Text
80 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
81 import Gargantext.Core.Text.List (buildNgramsLists)
82 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
83 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
84 import Gargantext.Core.Text.Terms
85 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
86 import Gargantext.Core.Types (POS(NP), TermsCount)
87 import Gargantext.Core.Types.Individu (User(..))
88 import Gargantext.Core.Types.Main
89 import Gargantext.Core.Types.Query (Limit)
90 import Gargantext.Core.Utils (addTuples)
91 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
92 import Gargantext.Database.Action.Flow.List
93 import Gargantext.Database.Action.Flow.Types
94 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
95 import Gargantext.Database.Action.Search (searchDocInDatabase)
96 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
97 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
98 import Gargantext.Database.Admin.Types.Hyperdata
99 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
100 import Gargantext.Database.Prelude
101 import Gargantext.Database.Query.Table.ContextNodeNgrams2
102 import Gargantext.Database.Query.Table.Ngrams
103 import Gargantext.Database.Query.Table.Node
104 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
105 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
106 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
107 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
108 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
109 import Gargantext.Database.Types
110 import Gargantext.Prelude
111 import Gargantext.Prelude.Crypto.Hash (Hash)
112 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
113 import qualified Gargantext.Core.Text.Corpus.API as API
114 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
115 import qualified PUBMED.Types as PUBMED
116 --import qualified Prelude
117
118 ------------------------------------------------------------------------
119 -- Imports for upgrade function
120 import Gargantext.Database.Query.Tree.Root (getRootId)
121 import Gargantext.Database.Query.Tree (findNodesId)
122 import qualified Data.List as List
123 ------------------------------------------------------------------------
124 -- TODO use internal with API name (could be old data)
125 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
126 | ExternalOrigin { _do_api :: API.ExternalAPIs }
127 -- TODO Web
128 deriving (Generic, Eq)
129
130 makeLenses ''DataOrigin
131 deriveJSON (unPrefix "_do_") ''DataOrigin
132 instance ToSchema DataOrigin where
133 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
134
135 allDataOrigins :: [DataOrigin]
136 allDataOrigins = map InternalOrigin API.externalAPIs <> map ExternalOrigin API.externalAPIs
137
138 ---------------
139 data DataText = DataOld ![NodeId]
140 | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
141 --- | DataNew ![[HyperdataDocument]]
142
143 -- Show instance is not possible because of IO
144 printDataText :: DataText -> IO ()
145 printDataText (DataOld xs) = putStrLn $ show xs
146 printDataText (DataNew (maybeInt, conduitData)) = do
147 res <- C.runConduit (conduitData .| CL.consume)
148 putStrLn $ show (maybeInt, res)
149
150 -- TODO use the split parameter in config file
151 getDataText :: FlowCmdM env err m
152 => DataOrigin
153 -> TermType Lang
154 -> API.RawQuery
155 -> Maybe PUBMED.APIKey
156 -> Maybe API.Limit
157 -> m (Either API.GetCorpusError DataText)
158 getDataText (ExternalOrigin api) la q mPubmedAPIKey li = do
159 eRes <- liftBase $ API.get api (_tt_lang la) q mPubmedAPIKey li
160 pure $ DataNew <$> eRes
161 getDataText (InternalOrigin _) _la q _ _li = do
162 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
163 (UserName userMaster)
164 (Left "")
165 (Nothing :: Maybe HyperdataCorpus)
166 ids <- map fst <$> searchDocInDatabase cId (stemIt $ API.getRawQuery q)
167 pure $ Right $ DataOld ids
168
169 getDataText_Debug :: FlowCmdM env err m
170 => DataOrigin
171 -> TermType Lang
172 -> API.RawQuery
173 -> Maybe API.Limit
174 -> m ()
175 getDataText_Debug a l q li = do
176 result <- getDataText a l q Nothing li
177 case result of
178 Left err -> liftBase $ putStrLn $ show err
179 Right res -> liftBase $ printDataText res
180
181
182 -------------------------------------------------------------------------------
183 flowDataText :: forall env err m.
184 ( FlowCmdM env err m
185 , MonadJobStatus m
186 )
187 => User
188 -> DataText
189 -> TermType Lang
190 -> CorpusId
191 -> Maybe FlowSocialListWith
192 -> JobHandle m
193 -> m CorpusId
194 flowDataText u (DataOld ids) tt cid mfslw _ = do
195 (_userId, userCorpusId, listId) <- createNodes u (Right [cid]) corpusType
196 _ <- Doc.add userCorpusId ids
197 flowCorpusUser (_tt_lang tt) u userCorpusId listId corpusType mfslw
198 where
199 corpusType = (Nothing :: Maybe HyperdataCorpus)
200 flowDataText u (DataNew (mLen, txtC)) tt cid mfslw jobHandle =
201 flowCorpus u (Right [cid]) tt mfslw (mLen, (transPipe liftBase txtC)) jobHandle
202
203 ------------------------------------------------------------------------
204 -- TODO use proxy
205 flowAnnuaire :: (FlowCmdM env err m, MonadJobStatus m)
206 => User
207 -> Either CorpusName [CorpusId]
208 -> (TermType Lang)
209 -> FilePath
210 -> JobHandle m
211 -> m AnnuaireId
212 flowAnnuaire u n l filePath jobHandle = do
213 -- TODO Conduit for file
214 docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
215 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) jobHandle
216
217 ------------------------------------------------------------------------
218 flowCorpusFile :: (FlowCmdM env err m, MonadJobStatus m)
219 => User
220 -> Either CorpusName [CorpusId]
221 -> Limit -- Limit the number of docs (for dev purpose)
222 -> TermType Lang
223 -> FileType
224 -> FileFormat
225 -> FilePath
226 -> Maybe FlowSocialListWith
227 -> JobHandle m
228 -> m CorpusId
229 flowCorpusFile u n _l la ft ff fp mfslw jobHandle = do
230 eParsed <- liftBase $ parseFile ft ff fp
231 case eParsed of
232 Right parsed -> do
233 flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) jobHandle
234 --let docs = splitEvery 500 $ take l parsed
235 --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
236 Left e -> panic $ "Error: " <> T.pack e
237
238 ------------------------------------------------------------------------
239 -- | TODO improve the needed type to create/update a corpus
240 -- (For now, Either is enough)
241 flowCorpus :: (FlowCmdM env err m, FlowCorpus a, MonadJobStatus m)
242 => User
243 -> Either CorpusName [CorpusId]
244 -> TermType Lang
245 -> Maybe FlowSocialListWith
246 -> (Maybe Integer, ConduitT () a m ())
247 -> JobHandle m
248 -> m CorpusId
249 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
250
251
252 flow :: forall env err m a c.
253 ( FlowCmdM env err m
254 , FlowCorpus a
255 , MkCorpus c
256 , MonadJobStatus m
257 )
258 => Maybe c
259 -> User
260 -> Either CorpusName [CorpusId]
261 -> TermType Lang
262 -> Maybe FlowSocialListWith
263 -> (Maybe Integer, ConduitT () a m ())
264 -> JobHandle m
265 -> m CorpusId
266 flow c u cn la mfslw (mLength, docsC) jobHandle = do
267 (_userId, userCorpusId, listId) <- createNodes u cn c
268 -- TODO if public insertMasterDocs else insertUserDocs
269 _ <- runConduit $ zipSources (yieldMany [1..]) docsC
270 .| CList.chunksOf 100
271 .| mapMC insertDocs'
272 .| mapM_C (\ids' -> do
273 _ <- Doc.add userCorpusId ids'
274 pure ())
275 .| sinkList
276
277 _ <- flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw
278
279 -- ids <- traverse (\(idx, doc) -> do
280 -- id <- insertMasterDocs c la doc
281 -- logStatus JobLog { _scst_succeeded = Just $ 1 + idx
282 -- , _scst_failed = Just 0
283 -- , _scst_remaining = Just $ length docs - idx
284 -- , _scst_events = Just []
285 -- }
286 -- pure id
287 -- ) (zip [1..] docs)
288 --printDebug "[flow] calling flowCorpusUser" (0 :: Int)
289 pure userCorpusId
290 --flowCorpusUser (la ^. tt_lang) u cn c ids mfslw
291
292 where
293 insertDocs' :: [(Integer, a)] -> m [NodeId]
294 insertDocs' [] = pure []
295 insertDocs' docs = do
296 -- printDebug "[flow] calling insertDoc, ([idx], mLength) = " (fst <$> docs, mLength)
297 ids <- insertMasterDocs c la (snd <$> docs)
298 let maxIdx = maximum (fst <$> docs)
299 case mLength of
300 Nothing -> pure ()
301 Just _len -> do
302
303 let succeeded = fromIntegral (1 + maxIdx)
304 -- let remaining = fromIntegral (len - maxIdx)
305 -- Reconstruct the correct update state by using 'markStarted' and the other primitives.
306 -- We do this slightly awkward arithmetic such that when we call 'markProgress' we reduce
307 -- the number of 'remaining' of exactly '1 + maxIdx', and we will end up with a 'JobLog'
308 -- looking like this:
309 -- JobLog
310 -- { _scst_succeeded = Just $ fromIntegral $ 1 + maxIdx
311 -- , _scst_failed = Just 0
312 -- , _scst_remaining = Just $ fromIntegral $ len - maxIdx
313 -- , _scst_events = Just []
314 -- }
315 -- markStarted (remaining + succeeded) jobHandle
316 markProgress succeeded jobHandle
317
318 pure ids
319
320
321
322 ------------------------------------------------------------------------
323 createNodes :: ( FlowCmdM env err m
324 , MkCorpus c
325 )
326 => User
327 -> Either CorpusName [CorpusId]
328 -> Maybe c
329 -> m (UserId, CorpusId, ListId)
330 createNodes user corpusName ctype = do
331 -- User Flow
332 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
333 -- NodeTexts is first
334 _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
335 -- printDebug "NodeTexts: " tId
336
337 -- NodeList is second
338 listId <- getOrMkList userCorpusId userId
339
340 -- User Graph Flow
341 _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
342 _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
343
344 pure (userId, userCorpusId, listId)
345
346
347 flowCorpusUser :: ( FlowCmdM env err m
348 , MkCorpus c
349 )
350 => Lang
351 -> User
352 -> CorpusId
353 -> ListId
354 -> Maybe c
355 -> Maybe FlowSocialListWith
356 -> m CorpusId
357 flowCorpusUser l user userCorpusId listId ctype mfslw = do
358 server <- view (nlpServerGet l)
359 -- User List Flow
360 (masterUserId, _masterRootId, masterCorpusId)
361 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
362
363 --let gp = (GroupParams l 2 3 (StopSize 3))
364 -- Here the PosTagAlgo should be chosen according to the Lang
365 _ <- case mfslw of
366 (Just (NoList _)) -> do
367 -- printDebug "Do not build list" mfslw
368 pure ()
369 _ -> do
370 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw
371 $ GroupWithPosTag l server HashMap.empty
372
373 -- printDebug "flowCorpusUser:ngs" ngs
374
375 _userListId <- flowList_DbRepo listId ngs
376 _mastListId <- getOrMkList masterCorpusId masterUserId
377 pure ()
378 -- _ <- insertOccsUpdates userCorpusId mastListId
379 -- printDebug "userListId" userListId
380 --_ <- mkPhylo userCorpusId userId
381 -- Annuaire Flow
382 -- _ <- mkAnnuaire rootUserId userId
383 _ <- updateNgramsOccurrences userCorpusId (Just listId)
384
385 pure userCorpusId
386
387
388 insertMasterDocs :: ( FlowCmdM env err m
389 , FlowCorpus a
390 , MkCorpus c
391 )
392 => Maybe c
393 -> TermType Lang
394 -> [a]
395 -> m [DocId]
396 insertMasterDocs c lang hs = do
397 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
398 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
399 _ <- Doc.add masterCorpusId ids'
400 -- TODO
401 -- create a corpus with database name (CSV or PubMed)
402 -- add documents to the corpus (create node_node link)
403 -- this will enable global database monitoring
404
405 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
406 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
407 <- mapNodeIdNgrams
408 <$> documentIdWithNgrams
409 (extractNgramsT $ withLang lang documentsWithId)
410 documentsWithId
411
412 lId <- getOrMkList masterCorpusId masterUserId
413 -- _ <- saveDocNgramsWith lId mapNgramsDocs'
414 _ <- saveDocNgramsWith lId mapNgramsDocs'
415
416 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
417 pure ids'
418
419 saveDocNgramsWith :: (FlowCmdM env err m)
420 => ListId
421 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
422 -> m ()
423 saveDocNgramsWith lId mapNgramsDocs' = do
424 --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
425 let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
426 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
427
428 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
429
430 -- new
431 mapCgramsId <- listInsertDb lId toNodeNgramsW'
432 $ map (first _ngramsTerms . second Map.keys)
433 $ HashMap.toList mapNgramsDocs
434
435 --printDebug "saveDocNgramsWith" mapCgramsId
436 -- insertDocNgrams
437 let ngrams2insert = catMaybes [ ContextNodeNgrams2 <$> Just nId
438 <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
439 <*> Just (fromIntegral w :: Double)
440 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
441 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
442 , (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
443 ]
444 -- printDebug "Ngrams2Insert" ngrams2insert
445 _return <- insertContextNodeNgrams2 ngrams2insert
446
447 -- to be removed
448 _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
449
450 pure ()
451
452
453 ------------------------------------------------------------------------
454 -- TODO Type NodeDocumentUnicised
455 insertDocs :: ( FlowCmdM env err m
456 -- , FlowCorpus a
457 , FlowInsertDB a
458 )
459 => UserId
460 -> CorpusId
461 -> [a]
462 -> m ([ContextId], [Indexed ContextId a])
463 insertDocs uId cId hs = do
464 let docs = map addUniqId hs
465 newIds <- insertDb uId Nothing docs
466 -- printDebug "newIds" newIds
467 let
468 newIds' = map reId newIds
469 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
470 _ <- Doc.add cId newIds'
471 pure (newIds', documentsWithId)
472
473
474 ------------------------------------------------------------------------
475 viewUniqId' :: UniqId a
476 => a
477 -> (Hash, a)
478 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
479 where
480 err = panic "[ERROR] Database.Flow.toInsert"
481
482
483 toInserted :: [ReturnId]
484 -> Map Hash ReturnId
485 toInserted =
486 Map.fromList . map (\r -> (reUniqId r, r) )
487 . filter (\r -> reInserted r == True)
488
489 mergeData :: Map Hash ReturnId
490 -> Map Hash a
491 -> [Indexed NodeId a]
492 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
493 where
494 toDocumentWithId (sha,hpd) =
495 Indexed <$> fmap reId (lookup sha rs)
496 <*> Just hpd
497
498 ------------------------------------------------------------------------
499 ------------------------------------------------------------------------
500 ------------------------------------------------------------------------
501 documentIdWithNgrams :: HasNodeError err
502 => (a
503 -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
504 -> [Indexed NodeId a]
505 -> Cmd err [DocumentIdWithNgrams a b]
506 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
507 where
508 toDocumentIdWithNgrams d = do
509 e <- f $ _unIndex d
510 pure $ DocumentIdWithNgrams d e
511
512
513 -- | TODO check optimization
514 mapNodeIdNgrams :: (Ord b, Hashable b)
515 => [DocumentIdWithNgrams a b]
516 -> HashMap b
517 (Map NgramsType
518 (Map NodeId (Int, TermsCount))
519 )
520 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
521 where
522 -- | NOTE We are somehow multiplying 'TermsCount' here: If the
523 -- same ngrams term has different ngrams types, the 'TermsCount'
524 -- for it (which is the number of times the terms appears in a
525 -- document) is copied over to all its types.
526 f :: DocumentIdWithNgrams a b
527 -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
528 f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
529 where
530 nId = _index $ documentWithId d
531
532
533 ------------------------------------------------------------------------
534 instance ExtractNgramsT HyperdataContact
535 where
536 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
537 where
538 extract :: TermType Lang -> HyperdataContact
539 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
540 extract _l hc' = do
541 let authors = map text2ngrams
542 $ maybe ["Nothing"] (\a -> [a])
543 $ view (hc_who . _Just . cw_lastName) hc'
544
545 pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
546
547
548 instance ExtractNgramsT HyperdataDocument
549 where
550 extractNgramsT :: TermType Lang
551 -> HyperdataDocument
552 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
553 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
554 where
555 extractNgramsT' :: TermType Lang
556 -> HyperdataDocument
557 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
558 extractNgramsT' lang' doc = do
559 let source = text2ngrams
560 $ maybe "Nothing" identity
561 $ _hd_source doc
562
563 institutes = map text2ngrams
564 $ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
565 $ _hd_institutes doc
566
567 authors = map text2ngrams
568 $ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
569 $ _hd_authors doc
570
571 ncs <- view (nlpServerGet $ lang' ^. tt_lang)
572
573 termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
574 <$> concat
575 <$> liftBase (extractTerms ncs lang' $ hasText doc)
576
577 pure $ HashMap.fromList
578 $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
579 <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes ]
580 <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
581 <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
582
583 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
584 where
585 extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
586
587 instance HasText a => HasText (Node a)
588 where
589 hasText (Node { _node_hyperdata = h }) = hasText h
590
591
592
593 -- | TODO putelsewhere
594 -- | Upgrade function
595 -- Suppose all documents are English (this is the case actually)
596 indexAllDocumentsWithPosTag :: FlowCmdM env err m
597 => m ()
598 indexAllDocumentsWithPosTag = do
599 rootId <- getRootId (UserName userMaster)
600 corpusIds <- findNodesId rootId [NodeCorpus]
601 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
602 _ <- mapM extractInsert (splitEvery 1000 docs)
603 pure ()
604
605 extractInsert :: FlowCmdM env err m
606 => [Node HyperdataDocument] -> m ()
607 extractInsert docs = do
608 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
609 mapNgramsDocs' <- mapNodeIdNgrams
610 <$> documentIdWithNgrams
611 (extractNgramsT $ withLang (Multi EN) documentsWithId)
612 documentsWithId
613 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
614 pure ()