]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[nlp] add sample support for languages to corenlp
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( DataText(..)
28 , getDataText
29 , getDataText_Debug
30 , flowDataText
31 , flow
32
33 , flowCorpusFile
34 , flowCorpus
35 , flowCorpusUser
36 , flowAnnuaire
37 , insertMasterDocs
38 , saveDocNgramsWith
39
40 , getOrMkRoot
41 , getOrMk_RootWithCorpus
42 , TermType(..)
43 , DataOrigin(..)
44 , allDataOrigins
45
46 , do_api
47 , indexAllDocumentsWithPosTag
48 )
49 where
50
51 import Conduit
52 import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
53 import Control.Monad.Reader (MonadReader)
54 import Data.Aeson.TH (deriveJSON)
55 import Data.Conduit.Internal (zipSources)
56 import qualified Data.Conduit.List as CList
57 import Data.Either
58 import Data.HashMap.Strict (HashMap)
59 import Data.Hashable (Hashable)
60 import Data.List (concat)
61 import Data.Map.Strict (Map, lookup)
62 import Data.Maybe (catMaybes)
63 import Data.Monoid
64 import Data.Swagger
65 import qualified Data.Text as T
66 import Data.Tuple.Extra (first, second)
67 import GHC.Generics (Generic)
68 import Servant.Client (ClientError)
69 import System.FilePath (FilePath)
70 import qualified Data.HashMap.Strict as HashMap
71 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
72 import qualified Data.Map.Strict as Map
73 import qualified Data.Conduit.List as CL
74 import qualified Data.Conduit as C
75
76 import Gargantext.Core (Lang(..), PosTagAlgo(..))
77 -- import Gargantext.Core.Ext.IMT (toSchoolName)
78 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
79 import Gargantext.Core.Flow.Types
80 import Gargantext.Core.NLP (nlpServerGet)
81 import Gargantext.Core.Text
82 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
83 import Gargantext.Core.Text.List (buildNgramsLists)
84 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
85 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
86 import Gargantext.Core.Text.Terms
87 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
88 import Gargantext.Core.Types (POS(NP), TermsCount)
89 import Gargantext.Core.Types.Individu (User(..))
90 import Gargantext.Core.Types.Main
91 import Gargantext.Core.Utils (addTuples)
92 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
93 import Gargantext.Database.Action.Flow.List
94 import Gargantext.Database.Action.Flow.Types
95 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
96 import Gargantext.Database.Action.Search (searchDocInDatabase)
97 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
98 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
99 import Gargantext.Database.Admin.Types.Hyperdata
100 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
101 import Gargantext.Database.Prelude
102 import Gargantext.Database.Query.Table.ContextNodeNgrams2
103 import Gargantext.Database.Query.Table.Ngrams
104 import Gargantext.Database.Query.Table.Node
105 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
106 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
107 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
108 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
109 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
110 import Gargantext.Database.Types
111 import Gargantext.Prelude
112 import Gargantext.Prelude.Crypto.Hash (Hash)
113 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
114 import qualified Gargantext.Core.Text.Corpus.API as API
115 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
116 --import qualified Prelude
117
118 ------------------------------------------------------------------------
119 -- Imports for upgrade function
120 import Gargantext.Database.Query.Tree.Root (getRootId)
121 import Gargantext.Database.Query.Tree (findNodesId)
122 import qualified Data.List as List
123 ------------------------------------------------------------------------
124 -- TODO use internal with API name (could be old data)
125 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
126 | ExternalOrigin { _do_api :: API.ExternalAPIs }
127 -- TODO Web
128 deriving (Generic, Eq)
129
130 makeLenses ''DataOrigin
131 deriveJSON (unPrefix "_do_") ''DataOrigin
132 instance ToSchema DataOrigin where
133 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
134
135 allDataOrigins :: ( MonadReader env m
136 , HasConfig env) => m [DataOrigin]
137 allDataOrigins = do
138 ext <- API.externalAPIs
139
140 pure $ map InternalOrigin ext
141 <> map ExternalOrigin ext
142
143 ---------------
144 data DataText = DataOld ![NodeId]
145 | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
146 --- | DataNew ![[HyperdataDocument]]
147
148 -- Show instance is not possible because of IO
149 printDataText :: DataText -> IO ()
150 printDataText (DataOld xs) = putStrLn $ show xs
151 printDataText (DataNew (maybeInt, conduitData)) = do
152 res <- C.runConduit (conduitData .| CL.consume)
153 putStrLn $ show (maybeInt, res)
154
155 -- TODO use the split parameter in config file
156 getDataText :: FlowCmdM env err m
157 => DataOrigin
158 -> TermType Lang
159 -> API.Query
160 -> Maybe API.Limit
161 -> m (Either ClientError DataText)
162 getDataText (ExternalOrigin api) la q li = liftBase $ do
163 eRes <- API.get api (_tt_lang la) q li
164 pure $ DataNew <$> eRes
165
166 getDataText (InternalOrigin _) _la q _li = do
167 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
168 (UserName userMaster)
169 (Left "")
170 (Nothing :: Maybe HyperdataCorpus)
171 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
172 pure $ Right $ DataOld ids
173
174 getDataText_Debug :: FlowCmdM env err m
175 => DataOrigin
176 -> TermType Lang
177 -> API.Query
178 -> Maybe API.Limit
179 -> m ()
180 getDataText_Debug a l q li = do
181 result <- getDataText a l q li
182 case result of
183 Left err -> liftBase $ putStrLn $ show err
184 Right res -> liftBase $ printDataText res
185
186
187 -------------------------------------------------------------------------------
188 flowDataText :: forall env err m.
189 ( FlowCmdM env err m
190 , MonadJobStatus m
191 )
192 => User
193 -> DataText
194 -> TermType Lang
195 -> CorpusId
196 -> Maybe FlowSocialListWith
197 -> JobHandle m
198 -> m CorpusId
199 flowDataText u (DataOld ids) tt cid mfslw _ = do
200 (_userId, userCorpusId, listId) <- createNodes u (Right [cid]) corpusType
201 _ <- Doc.add userCorpusId ids
202 flowCorpusUser (_tt_lang tt) u userCorpusId listId corpusType mfslw
203 where
204 corpusType = (Nothing :: Maybe HyperdataCorpus)
205 flowDataText u (DataNew (mLen, txtC)) tt cid mfslw jobHandle =
206 flowCorpus u (Right [cid]) tt mfslw (mLen, (transPipe liftBase txtC)) jobHandle
207
208 ------------------------------------------------------------------------
209 -- TODO use proxy
210 flowAnnuaire :: (FlowCmdM env err m, MonadJobStatus m)
211 => User
212 -> Either CorpusName [CorpusId]
213 -> (TermType Lang)
214 -> FilePath
215 -> JobHandle m
216 -> m AnnuaireId
217 flowAnnuaire u n l filePath jobHandle = do
218 -- TODO Conduit for file
219 docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
220 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) jobHandle
221
222 ------------------------------------------------------------------------
223 flowCorpusFile :: (FlowCmdM env err m, MonadJobStatus m)
224 => User
225 -> Either CorpusName [CorpusId]
226 -> Limit -- Limit the number of docs (for dev purpose)
227 -> TermType Lang
228 -> FileType
229 -> FileFormat
230 -> FilePath
231 -> Maybe FlowSocialListWith
232 -> JobHandle m
233 -> m CorpusId
234 flowCorpusFile u n _l la ft ff fp mfslw jobHandle = do
235 eParsed <- liftBase $ parseFile ft ff fp
236 case eParsed of
237 Right parsed -> do
238 flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) jobHandle
239 --let docs = splitEvery 500 $ take l parsed
240 --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
241 Left e -> panic $ "Error: " <> T.pack e
242
243 ------------------------------------------------------------------------
244 -- | TODO improve the needed type to create/update a corpus
245 -- (For now, Either is enough)
246 flowCorpus :: (FlowCmdM env err m, FlowCorpus a, MonadJobStatus m)
247 => User
248 -> Either CorpusName [CorpusId]
249 -> TermType Lang
250 -> Maybe FlowSocialListWith
251 -> (Maybe Integer, ConduitT () a m ())
252 -> JobHandle m
253 -> m CorpusId
254 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
255
256
257 flow :: forall env err m a c.
258 ( FlowCmdM env err m
259 , FlowCorpus a
260 , MkCorpus c
261 , MonadJobStatus m
262 )
263 => Maybe c
264 -> User
265 -> Either CorpusName [CorpusId]
266 -> TermType Lang
267 -> Maybe FlowSocialListWith
268 -> (Maybe Integer, ConduitT () a m ())
269 -> JobHandle m
270 -> m CorpusId
271 flow c u cn la mfslw (mLength, docsC) jobHandle = do
272 (_userId, userCorpusId, listId) <- createNodes u cn c
273 -- TODO if public insertMasterDocs else insertUserDocs
274 _ <- runConduit $ zipSources (yieldMany [1..]) docsC
275 .| CList.chunksOf 100
276 .| mapMC insertDocs'
277 .| mapM_C (\ids' -> do
278 _ <- Doc.add userCorpusId ids'
279 pure ())
280 .| sinkList
281
282 _ <- flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw
283
284 -- ids <- traverse (\(idx, doc) -> do
285 -- id <- insertMasterDocs c la doc
286 -- logStatus JobLog { _scst_succeeded = Just $ 1 + idx
287 -- , _scst_failed = Just 0
288 -- , _scst_remaining = Just $ length docs - idx
289 -- , _scst_events = Just []
290 -- }
291 -- pure id
292 -- ) (zip [1..] docs)
293 --printDebug "[flow] calling flowCorpusUser" (0 :: Int)
294 pure userCorpusId
295 --flowCorpusUser (la ^. tt_lang) u cn c ids mfslw
296
297 where
298 insertDocs' :: [(Integer, a)] -> m [NodeId]
299 insertDocs' [] = pure []
300 insertDocs' docs = do
301 -- printDebug "[flow] calling insertDoc, ([idx], mLength) = " (fst <$> docs, mLength)
302 ids <- insertMasterDocs c la (snd <$> docs)
303 let maxIdx = maximum (fst <$> docs)
304 case mLength of
305 Nothing -> pure ()
306 Just len -> do
307
308 let succeeded = fromIntegral (1 + maxIdx)
309 let remaining = fromIntegral (len - maxIdx)
310 -- Reconstruct the correct update state by using 'markStarted' and the other primitives.
311 -- We do this slightly awkward arithmetic such that when we call 'markProgress' we reduce
312 -- the number of 'remaining' of exactly '1 + maxIdx', and we will end up with a 'JobLog'
313 -- looking like this:
314 -- JobLog
315 -- { _scst_succeeded = Just $ fromIntegral $ 1 + maxIdx
316 -- , _scst_failed = Just 0
317 -- , _scst_remaining = Just $ fromIntegral $ len - maxIdx
318 -- , _scst_events = Just []
319 -- }
320 markStarted (remaining + succeeded) jobHandle
321 markProgress succeeded jobHandle
322
323 pure ids
324
325
326
327 ------------------------------------------------------------------------
328 createNodes :: ( FlowCmdM env err m
329 , MkCorpus c
330 )
331 => User
332 -> Either CorpusName [CorpusId]
333 -> Maybe c
334 -> m (UserId, CorpusId, ListId)
335 createNodes user corpusName ctype = do
336 -- User Flow
337 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
338 -- NodeTexts is first
339 _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
340 -- printDebug "NodeTexts: " tId
341
342 -- NodeList is second
343 listId <- getOrMkList userCorpusId userId
344
345 -- User Graph Flow
346 _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
347 _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
348
349 pure (userId, userCorpusId, listId)
350
351
352 flowCorpusUser :: ( FlowCmdM env err m
353 , MkCorpus c
354 )
355 => Lang
356 -> User
357 -> CorpusId
358 -> ListId
359 -> Maybe c
360 -> Maybe FlowSocialListWith
361 -> m CorpusId
362 flowCorpusUser l user userCorpusId listId ctype mfslw = do
363 server <- view (nlpServerGet l)
364 -- User List Flow
365 (masterUserId, _masterRootId, masterCorpusId)
366 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
367
368 --let gp = (GroupParams l 2 3 (StopSize 3))
369 -- Here the PosTagAlgo should be chosen according to the Lang
370 _ <- case mfslw of
371 (Just (NoList _)) -> do
372 -- printDebug "Do not build list" mfslw
373 pure ()
374 _ -> do
375 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw
376 $ GroupWithPosTag l server HashMap.empty
377
378 -- printDebug "flowCorpusUser:ngs" ngs
379
380 _userListId <- flowList_DbRepo listId ngs
381 _mastListId <- getOrMkList masterCorpusId masterUserId
382 pure ()
383 -- _ <- insertOccsUpdates userCorpusId mastListId
384 -- printDebug "userListId" userListId
385 --_ <- mkPhylo userCorpusId userId
386 -- Annuaire Flow
387 -- _ <- mkAnnuaire rootUserId userId
388 _ <- updateNgramsOccurrences userCorpusId (Just listId)
389
390 pure userCorpusId
391
392
393 insertMasterDocs :: ( FlowCmdM env err m
394 , FlowCorpus a
395 , MkCorpus c
396 )
397 => Maybe c
398 -> TermType Lang
399 -> [a]
400 -> m [DocId]
401 insertMasterDocs c lang hs = do
402 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
403 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
404 _ <- Doc.add masterCorpusId ids'
405 -- TODO
406 -- create a corpus with database name (CSV or PubMed)
407 -- add documents to the corpus (create node_node link)
408 -- this will enable global database monitoring
409
410 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
411 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
412 <- mapNodeIdNgrams
413 <$> documentIdWithNgrams
414 (extractNgramsT $ withLang lang documentsWithId)
415 documentsWithId
416
417 lId <- getOrMkList masterCorpusId masterUserId
418 -- _ <- saveDocNgramsWith lId mapNgramsDocs'
419 _ <- saveDocNgramsWith lId mapNgramsDocs'
420
421 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
422 pure ids'
423
424 saveDocNgramsWith :: (FlowCmdM env err m)
425 => ListId
426 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
427 -> m ()
428 saveDocNgramsWith lId mapNgramsDocs' = do
429 --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
430 let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
431 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
432
433 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
434
435 -- new
436 mapCgramsId <- listInsertDb lId toNodeNgramsW'
437 $ map (first _ngramsTerms . second Map.keys)
438 $ HashMap.toList mapNgramsDocs
439
440 --printDebug "saveDocNgramsWith" mapCgramsId
441 -- insertDocNgrams
442 let ngrams2insert = catMaybes [ ContextNodeNgrams2 <$> Just nId
443 <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
444 <*> Just (fromIntegral w :: Double)
445 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
446 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
447 , (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
448 ]
449 -- printDebug "Ngrams2Insert" ngrams2insert
450 _return <- insertContextNodeNgrams2 ngrams2insert
451
452 -- to be removed
453 _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
454
455 pure ()
456
457
458 ------------------------------------------------------------------------
459 -- TODO Type NodeDocumentUnicised
460 insertDocs :: ( FlowCmdM env err m
461 -- , FlowCorpus a
462 , FlowInsertDB a
463 )
464 => UserId
465 -> CorpusId
466 -> [a]
467 -> m ([ContextId], [Indexed ContextId a])
468 insertDocs uId cId hs = do
469 let docs = map addUniqId hs
470 newIds <- insertDb uId Nothing docs
471 -- printDebug "newIds" newIds
472 let
473 newIds' = map reId newIds
474 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
475 _ <- Doc.add cId newIds'
476 pure (newIds', documentsWithId)
477
478
479 ------------------------------------------------------------------------
480 viewUniqId' :: UniqId a
481 => a
482 -> (Hash, a)
483 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
484 where
485 err = panic "[ERROR] Database.Flow.toInsert"
486
487
488 toInserted :: [ReturnId]
489 -> Map Hash ReturnId
490 toInserted =
491 Map.fromList . map (\r -> (reUniqId r, r) )
492 . filter (\r -> reInserted r == True)
493
494 mergeData :: Map Hash ReturnId
495 -> Map Hash a
496 -> [Indexed NodeId a]
497 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
498 where
499 toDocumentWithId (sha,hpd) =
500 Indexed <$> fmap reId (lookup sha rs)
501 <*> Just hpd
502
503 ------------------------------------------------------------------------
504 ------------------------------------------------------------------------
505 ------------------------------------------------------------------------
506 documentIdWithNgrams :: HasNodeError err
507 => (a
508 -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
509 -> [Indexed NodeId a]
510 -> Cmd err [DocumentIdWithNgrams a b]
511 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
512 where
513 toDocumentIdWithNgrams d = do
514 e <- f $ _unIndex d
515 pure $ DocumentIdWithNgrams d e
516
517
518 -- | TODO check optimization
519 mapNodeIdNgrams :: (Ord b, Hashable b)
520 => [DocumentIdWithNgrams a b]
521 -> HashMap b
522 (Map NgramsType
523 (Map NodeId (Int, TermsCount))
524 )
525 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
526 where
527 -- | NOTE We are somehow multiplying 'TermsCount' here: If the
528 -- same ngrams term has different ngrams types, the 'TermsCount'
529 -- for it (which is the number of times the terms appears in a
530 -- document) is copied over to all its types.
531 f :: DocumentIdWithNgrams a b
532 -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
533 f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
534 where
535 nId = _index $ documentWithId d
536
537
538 ------------------------------------------------------------------------
539 instance ExtractNgramsT HyperdataContact
540 where
541 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
542 where
543 extract :: TermType Lang -> HyperdataContact
544 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
545 extract _l hc' = do
546 let authors = map text2ngrams
547 $ maybe ["Nothing"] (\a -> [a])
548 $ view (hc_who . _Just . cw_lastName) hc'
549
550 pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
551
552
553 instance ExtractNgramsT HyperdataDocument
554 where
555 extractNgramsT :: TermType Lang
556 -> HyperdataDocument
557 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
558 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
559 where
560 extractNgramsT' :: TermType Lang
561 -> HyperdataDocument
562 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
563 extractNgramsT' lang' doc = do
564 let source = text2ngrams
565 $ maybe "Nothing" identity
566 $ _hd_source doc
567
568 institutes = map text2ngrams
569 $ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
570 $ _hd_institutes doc
571
572 authors = map text2ngrams
573 $ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
574 $ _hd_authors doc
575
576 ncs <- view (nlpServerGet $ lang' ^. tt_lang)
577
578 termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
579 <$> concat
580 <$> liftBase (extractTerms ncs lang' $ hasText doc)
581
582 pure $ HashMap.fromList
583 $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
584 <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes ]
585 <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
586 <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
587
588 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
589 where
590 extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
591
592 instance HasText a => HasText (Node a)
593 where
594 hasText (Node { _node_hyperdata = h }) = hasText h
595
596
597
598 -- | TODO putelsewhere
599 -- | Upgrade function
600 -- Suppose all documents are English (this is the case actually)
601 indexAllDocumentsWithPosTag :: FlowCmdM env err m
602 => m ()
603 indexAllDocumentsWithPosTag = do
604 rootId <- getRootId (UserName userMaster)
605 corpusIds <- findNodesId rootId [NodeCorpus]
606 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
607 _ <- mapM extractInsert (splitEvery 1000 docs)
608 pure ()
609
610 extractInsert :: FlowCmdM env err m
611 => [Node HyperdataDocument] -> m ()
612 extractInsert docs = do
613 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
614 mapNgramsDocs' <- mapNodeIdNgrams
615 <$> documentIdWithNgrams
616 (extractNgramsT $ withLang (Multi EN) documentsWithId)
617 documentsWithId
618 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
619 pure ()