]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[FEAT] Iramuteq parser (WIP)
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
Module      : Gargantext.Database.Action.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( DataText(..)
28 , getDataText
29 , getDataText_Debug
30 , flowDataText
31 , flow
32
33 , flowCorpusFile
34 , flowCorpus
35 , flowCorpusUser
36 , flowAnnuaire
37 , insertMasterDocs
38 , saveDocNgramsWith
39
40 , getOrMkRoot
41 , getOrMk_RootWithCorpus
42 , TermType(..)
43 , DataOrigin(..)
44 , allDataOrigins
45
46 , do_api
47 , indexAllDocumentsWithPosTag
48 )
49 where
50
51 import Conduit
52 import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
53 import Control.Monad.Reader (MonadReader)
54 import Data.Aeson.TH (deriveJSON)
55 import Data.Conduit.Internal (zipSources)
56 import qualified Data.Conduit.List as CList
57 import Data.Either
58 import Data.HashMap.Strict (HashMap)
59 import Data.Hashable (Hashable)
60 import Data.List (concat)
61 import Data.Map.Strict (Map, lookup)
62 import Data.Maybe (catMaybes)
63 import Data.Monoid
64 import Data.Swagger
65 import qualified Data.Text as T
66 import Data.Tuple.Extra (first, second)
67 import GHC.Generics (Generic)
68 import Servant.Client (ClientError)
69 import System.FilePath (FilePath)
70 import qualified Data.HashMap.Strict as HashMap
71 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
72 import qualified Data.Map.Strict as Map
73 import qualified Data.Conduit.List as CL
74 import qualified Data.Conduit as C
75
76 import Gargantext.Core (Lang(..), PosTagAlgo(..))
77 -- import Gargantext.Core.Ext.IMT (toSchoolName)
78 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
79 import Gargantext.Core.Flow.Types
80 import Gargantext.Core.NLP (nlpServerGet)
81 import Gargantext.Core.Text
82 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
83 import Gargantext.Core.Text.List (buildNgramsLists)
84 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
85 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
86 import Gargantext.Core.Text.Terms
87 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
88 import Gargantext.Core.Types (POS(NP), TermsCount)
89 import Gargantext.Core.Types.Individu (User(..))
90 import Gargantext.Core.Types.Main
91 import Gargantext.Core.Types.Query (Limit)
92 import Gargantext.Core.Utils (addTuples)
93 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
94 import Gargantext.Database.Action.Flow.List
95 import Gargantext.Database.Action.Flow.Types
96 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
97 import Gargantext.Database.Action.Search (searchDocInDatabase)
98 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
99 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
100 import Gargantext.Database.Admin.Types.Hyperdata
101 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
102 import Gargantext.Database.Prelude
103 import Gargantext.Database.Query.Table.ContextNodeNgrams2
104 import Gargantext.Database.Query.Table.Ngrams
105 import Gargantext.Database.Query.Table.Node
106 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
107 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
108 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
109 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
110 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
111 import Gargantext.Database.Types
112 import Gargantext.Prelude
113 import Gargantext.Prelude.Crypto.Hash (Hash)
114 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
115 import qualified Gargantext.Core.Text.Corpus.API as API
116 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
117 --import qualified Prelude
118
119 ------------------------------------------------------------------------
120 -- Imports for upgrade function
121 import Gargantext.Database.Query.Tree.Root (getRootId)
122 import Gargantext.Database.Query.Tree (findNodesId)
123 import qualified Data.List as List
124 ------------------------------------------------------------------------
125 -- TODO use internal with API name (could be old data)
-- | Provenance of the textual data to be flowed into a corpus: either
--   already present in our own database ('InternalOrigin') or fetched
--   from an external API ('ExternalOrigin').  Both constructors carry
--   the API they relate to in '_do_api'.
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
               -- TODO Web
  deriving (Generic, Eq)

-- Template Haskell: lenses, JSON (de)serialisation and Swagger schema,
-- all stripping the "_do_" field prefix.
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
135
-- | Every 'DataOrigin' documents may come from: each configured
-- external API is listed both as an internal and as an external origin.
allDataOrigins :: ( MonadReader env m
                  , HasConfig env) => m [DataOrigin]
allDataOrigins =
  (\apis -> map InternalOrigin apis <> map ExternalOrigin apis)
    <$> API.externalAPIs
143
144 ---------------
-- | The text payload to flow into a corpus: either references to
--   documents already stored in the database ('DataOld'), or a stream
--   of new documents together with an optional total count ('DataNew').
data DataText = DataOld ![NodeId]
              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
              --- | DataNew ![[HyperdataDocument]]
148
-- | Debug helper: dump a 'DataText' to stdout.  (A 'Show' instance is
-- not possible because 'DataNew' carries an 'IO' conduit, so the
-- stream has to be drained before it can be shown.)
printDataText :: DataText -> IO ()
printDataText (DataOld nodeIds) =
  putStrLn $ show nodeIds
printDataText (DataNew (mCount, docsC)) =
  C.runConduit (docsC .| CL.consume) >>= \docs ->
    putStrLn $ show (mCount, docs)
155
-- TODO use the split parameter in config file
-- | Fetch the documents designated by a 'DataOrigin': query the
-- external API for 'ExternalOrigin', or search the master corpus of
-- our own database (stemming the query) for 'InternalOrigin'.
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li =
  liftBase $ fmap DataNew <$> API.get api (_tt_lang la) q li
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <-
    getOrMk_RootWithCorpus (UserName userMaster)
                           (Left "")
                           (Nothing :: Maybe HyperdataCorpus)
  docIds <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ Right $ DataOld docIds
174
-- | Like 'getDataText', but prints the result (or the client error)
-- instead of returning it; intended for interactive debugging.
getDataText_Debug :: FlowCmdM env err m
                  => DataOrigin
                  -> TermType Lang
                  -> API.Query
                  -> Maybe API.Limit
                  -> m ()
getDataText_Debug a l q li =
  getDataText a l q li
    >>= either (\err -> liftBase $ putStrLn $ show err)
               (liftBase . printDataText)
186
187
188 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into the given corpus: documents already in the
-- database ('DataOld') are just linked to the corpus, while new
-- documents ('DataNew') go through the full 'flowCorpus' pipeline.
flowDataText :: forall env err m.
                ( FlowCmdM env err m
                , MonadJobStatus m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> JobHandle m
             -> m CorpusId
flowDataText u (DataOld oldIds) tt cid mfslw _ = do
  let corpusType = (Nothing :: Maybe HyperdataCorpus)
  (_userId, userCorpusId, listId) <- createNodes u (Right [cid]) corpusType
  _ <- Doc.add userCorpusId oldIds
  flowCorpusUser (_tt_lang tt) u userCorpusId listId corpusType mfslw
flowDataText u (DataNew (mLen, docsC)) tt cid mfslw jobHandle =
  flowCorpus u (Right [cid]) tt mfslw (mLen, transPipe liftBase docsC) jobHandle
208
209 ------------------------------------------------------------------------
-- TODO use proxy
-- | Flow an annuaire (contacts directory) read from a file on disk
-- into the database.
flowAnnuaire :: (FlowCmdM env err m, MonadJobStatus m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> JobHandle m
             -> m AnnuaireId
flowAnnuaire u n l filePath jobHandle = do
  -- TODO Conduit for file
  contacts <- liftBase (readFile_Annuaire filePath :: IO [HyperdataContact])
  let mLen = Just $ fromIntegral $ length contacts
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (mLen, yieldMany contacts) jobHandle
222
223 ------------------------------------------------------------------------
-- | Flow a corpus parsed from a local file.  Panics when parsing
-- fails.  (The 'Limit' argument is currently unused.)
flowCorpusFile :: (FlowCmdM env err m, MonadJobStatus m)
           => User
           -> Either CorpusName [CorpusId]
           -> Limit -- Limit the number of docs (for dev purpose)
           -> TermType Lang
           -> FileType
           -> FileFormat
           -> FilePath
           -> Maybe FlowSocialListWith
           -> JobHandle m
           -> m CorpusId
flowCorpusFile u n _l la ft ff fp mfslw jobHandle = do
  eParsed <- liftBase $ parseFile ft ff fp
  case eParsed of
    Left err     -> panic $ "Error: " <> T.pack err
    Right parsed ->
      let mLen  = Just $ fromIntegral $ length parsed
          docsC = yieldMany parsed .| mapC toHyperdataDocument
      in flowCorpus u n la mfslw (mLen, docsC) jobHandle
243
244 ------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
-- Specialisation of 'flow' to a default ('Nothing') 'HyperdataCorpus'.
flowCorpus :: (FlowCmdM env err m, FlowCorpus a, MonadJobStatus m)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> (Maybe Integer, ConduitT () a m ())
           -> JobHandle m
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
256
257
-- | Generic entry point of the flow: create the corpus/list nodes,
-- stream the documents into the master corpus in chunks of 100
-- (reporting job progress as each chunk completes), then build the
-- user lists with 'flowCorpusUser'.  Returns the user corpus id.
flow :: forall env err m a c.
        ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        , MonadJobStatus m
        )
     => Maybe c                             -- ^ corpus hyperdata ('Nothing' for default)
     -> User
     -> Either CorpusName [CorpusId]        -- ^ create by name, or reuse existing ids
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> (Maybe Integer, ConduitT () a m ()) -- ^ optional total count + document stream
     -> JobHandle m
     -> m CorpusId
flow c u cn la mfslw (mLength, docsC) jobHandle = do
  (_userId, userCorpusId, listId) <- createNodes u cn c
  -- TODO if public insertMasterDocs else insertUserDocs
  -- Stream: tag each doc with its 1-based index, insert in chunks of
  -- 100, and link every inserted chunk to the user corpus.
  _ <- runConduit $ zipSources (yieldMany [1..]) docsC
                 .| CList.chunksOf 100
                 .| mapMC insertDocs'
                 .| mapM_C (\ids' -> do
                             _ <- Doc.add userCorpusId ids'
                             pure ())
                 .| sinkList

  _ <- flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw

  -- ids <- traverse (\(idx, doc) -> do
  --                     id <- insertMasterDocs c la doc
  --                     logStatus JobLog { _scst_succeeded = Just $ 1 + idx
  --                                      , _scst_failed    = Just 0
  --                                      , _scst_remaining = Just $ length docs - idx
  --                                      , _scst_events    = Just []
  --                                      }
  --                     pure id
  --                  ) (zip [1..] docs)
  --printDebug "[flow] calling flowCorpusUser" (0 :: Int)
  pure userCorpusId
  --flowCorpusUser (la ^. tt_lang) u cn c ids mfslw

  where
    -- Insert one chunk of (index, document) pairs into the master
    -- corpus and report progress up to the highest index seen so far.
    insertDocs' :: [(Integer, a)] -> m [NodeId]
    insertDocs' [] = pure []
    insertDocs' docs = do
      -- printDebug "[flow] calling insertDoc, ([idx], mLength) = " (fst <$> docs, mLength)
      ids <- insertMasterDocs c la (snd <$> docs)
      -- safe: the empty case is handled by the clause above
      let maxIdx = maximum (fst <$> docs)
      case mLength of
        Nothing -> pure ()
        Just _len -> do

          let succeeded = fromIntegral (1 + maxIdx)
          -- let remaining = fromIntegral (len - maxIdx)
          -- Reconstruct the correct update state by using 'markStarted' and the other primitives.
          -- We do this slightly awkward arithmetic such that when we call 'markProgress' we reduce
          -- the number of 'remaining' of exactly '1 + maxIdx', and we will end up with a 'JobLog'
          -- looking like this:
          --  JobLog
          --    { _scst_succeeded = Just $ fromIntegral $ 1 + maxIdx
          --    , _scst_failed    = Just 0
          --    , _scst_remaining = Just $ fromIntegral $ len - maxIdx
          --    , _scst_events    = Just []
          --    }
          -- markStarted (remaining + succeeded) jobHandle
          markProgress succeeded jobHandle

      pure ids
325
326
327
328 ------------------------------------------------------------------------
-- | Create (or fetch, when they already exist) the nodes a corpus
-- needs: the corpus itself under the user's root, plus its Texts,
-- List, Graph and Dashboard child nodes.  The creation order below is
-- significant.
createNodes :: ( FlowCmdM env err m
               , MkCorpus c
               )
            => User
            -> Either CorpusName [CorpusId]
            -> Maybe c
            -> m (UserId, CorpusId, ListId)
createNodes user corpusName ctype = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId

  -- User Graph Flow
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId

  pure (userId, userCorpusId, listId)
351
352
-- | Build the user's ngrams lists for a corpus: unless list building
-- is explicitly disabled ('NoList'), compute the ngrams lists against
-- the master corpus and store them, then refresh the ngrams
-- occurrences.  Returns the corpus id unchanged.
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> CorpusId
               -> ListId
               -> Maybe c
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user userCorpusId listId ctype mfslw = do
  -- NLP server matching the corpus language
  server <- view (nlpServerGet l)
  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
  _ <- case mfslw of
         (Just (NoList _)) -> do
           -- printDebug "Do not build list" mfslw
           pure ()
         _ -> do
           ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw
                    $ GroupWithPosTag l server HashMap.empty

           -- printDebug "flowCorpusUser:ngs" ngs

           _userListId <- flowList_DbRepo listId ngs
           _mastListId <- getOrMkList masterCorpusId masterUserId
           pure ()
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  --_ <- mkPhylo  userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire  rootUserId userId
  _ <- updateNgramsOccurrences userCorpusId (Just listId)

  pure userCorpusId
392
393
-- | Insert documents into the master corpus, extract their ngrams and
-- store them against the master list.  Returns the ids of the inserted
-- documents.
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  -- Extract ngrams from every document, then aggregate them per ngram
  -- / ngrams-type / document.
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                <- mapNodeIdNgrams
                <$> documentIdWithNgrams
                    (extractNgramsT $ withLang lang documentsWithId)
                    documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  -- _ <- saveDocNgramsWith lId mapNgramsDocs'
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'
424
-- | Persist the extracted ngrams of a set of documents against the
-- given list: insert the ngrams themselves, the list/ngrams links, and
-- the per-document weights.
saveDocNgramsWith :: (FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
  -- drop the TermsCount component, keeping only the Int weight
  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ HashMap.toList mapNgramsDocs

  --printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  -- One row per (ngram, ngrams-type, document) triple; the weight is
  -- the Int count, the TermsCount is dropped here.
  let ngrams2insert =  catMaybes [ ContextNodeNgrams2 <$> Just nId
                                            <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
                                            <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight
                       ]
  -- printDebug "Ngrams2Insert" ngrams2insert
  _return <- insertContextNodeNgrams2 ngrams2insert

  -- to be removed
  _   <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()
457
458
459 ------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
-- | Insert documents into the given corpus, deduplicated by their
-- unique hash.  Returns the context ids returned by the database and
-- the documents indexed by those ids.
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  -- attach a unique hash to each document so duplicates can be detected
  let docs = map addUniqId hs
  newIds <- insertDb uId Nothing docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    -- join the returned ids back onto the original documents by hash
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  -- link every document to the corpus node
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)
478
479
480 ------------------------------------------------------------------------
-- | Pair a value with its unique hash; panics when the hash is absent
-- (it should always have been set by 'addUniqId' beforehand).
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d =
  case view uniqId d of
    Just h  -> (h, d)
    Nothing -> panic "[ERROR] Database.Flow.toInsert"
487
488
-- | Index the rows actually inserted in the database by their unique
-- hash, dropping rows that were already present.
--
-- Fix: the original filtered with @\r -> reInserted r == True@; the
-- redundant comparison with 'True' is replaced by the predicate itself.
toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r) )
               . filter reInserted
494
-- | Join the inserted-row map with the original documents by hash,
-- keeping only documents whose insertion returned an id.
mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs hpds =
  [ Indexed (reId ret) hpd
  | (sha, hpd) <- Map.toList hpds
  , Just ret   <- [lookup sha rs]
  ]
503
504 ------------------------------------------------------------------------
505 ------------------------------------------------------------------------
506 ------------------------------------------------------------------------
-- | Run the ngrams-extraction function over every indexed document,
-- pairing each document with its extracted ngrams.
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f =
  traverse (\d -> DocumentIdWithNgrams d <$> f (_unIndex d))
517
518
-- | TODO check optimization
-- | Merge per-document ngrams maps into one map keyed by ngram,
-- combining (weight, termsCount) pairs with 'addTuples' on collisions.
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                       (Map NgramsType
                            (Map NodeId (Int, TermsCount))
                       )
mapNodeIdNgrams =
  HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . map perDoc
  where
    -- NOTE We are somehow multiplying 'TermsCount' here: If the
    -- same ngrams term has different ngrams types, the 'TermsCount'
    -- for it (which is the number of times the terms appears in a
    -- document) is copied over to all its types.
    perDoc :: DocumentIdWithNgrams a b
           -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
    perDoc d =
      let nId = _index (documentWithId d)
      in fmap (\(typeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) typeMap)
              (documentNgrams d)
537
538
539 ------------------------------------------------------------------------
-- | Contacts only contribute their last name, indexed as an author
-- ngram (truncated to 255 characters like all extracted ngrams).
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc =
      HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extract _l contact =
          let lastName = view (hc_who . _Just . cw_lastName) contact
              authors  = map text2ngrams (maybe ["Nothing"] (\a -> [a]) lastName)
          in pure $ HashMap.fromList
                  $ [ (SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
552
553
-- | Documents contribute their source, institutes, authors and — via
-- the NLP server — the enriched terms of their text.  All keys are
-- truncated to 255 characters.
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        extractNgramsT' lang' doc = do
          -- missing metadata fields become the literal "Nothing" ngram
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              -- institutes/authors are split using the separator
              -- conventions of the document's source database (hd_bdd)
              institutes = map text2ngrams
                         $ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
                      $ _hd_authors doc

          ncs <- view (nlpServerGet $ lang' ^. tt_lang)

          -- extract terms from the document text, then enrich them
          -- with POS information (noun phrases via CoreNLP)
          termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
                           <$> concat
                           <$> liftBase (extractTerms ncs lang' $ hasText doc)

          pure $ HashMap.fromList
               $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
               <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes ]
               <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
               <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
588
-- | Extracting ngrams from a 'Node' delegates to its hyperdata.
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h

-- | A 'Node' has text exactly when its hyperdata has text.
instance HasText a => HasText (Node a)
  where
    hasText (Node { _node_hyperdata = h }) = hasText h
596
597
598
-- | TODO putelsewhere
-- | Upgrade function
-- Suppose all documents are English (this is the case actually)
-- Re-extract and re-insert the ngrams of every document of every
-- corpus under the master user.
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  rootId <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  -- process in batches of 1000 to keep memory bounded
  _ <- mapM extractInsert (splitEvery 1000 docs)
  pure ()
610
-- | Extract ngrams from the given documents (treating them as
-- multilingual English, per 'indexAllDocumentsWithPosTag') and insert
-- the extracted ngrams into the database.
extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  -- index each document by its own node id
  let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
  mapNgramsDocs' <- mapNodeIdNgrams
                <$> documentIdWithNgrams
                    (extractNgramsT $ withLang (Multi EN) documentsWithId)
                    documentsWithId
  _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  pure ()