]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[flow] add better progress report to flow corpus
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( DataText(..)
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36 , saveDocNgramsWith
37
38 , getOrMkRoot
39 , getOrMk_RootWithCorpus
40 , TermType(..)
41 , DataOrigin(..)
42 , allDataOrigins
43
44 , do_api
45 , indexAllDocumentsWithPosTag
46 )
47 where
48
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
51 import Data.Either
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
57 import Data.Monoid
58 import Data.Swagger
59 import qualified Data.Text as T
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
67
68 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
69 import Gargantext.Core (Lang(..), PosTagAlgo(..))
70 import Gargantext.Core.Ext.IMT (toSchoolName)
71 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
72 import Gargantext.Core.Flow.Types
73 import Gargantext.Core.Text
74 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
75 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
76 import Gargantext.Core.Text.List (buildNgramsLists)
77 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
78 import Gargantext.Core.Text.Terms
79 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
80 import Gargantext.Core.Types (POS(NP))
81 import Gargantext.Core.Types.Individu (User(..))
82 import Gargantext.Core.Types.Main
83 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
84 import Gargantext.Database.Action.Flow.List
85 import Gargantext.Database.Action.Flow.Types
86 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
87 import Gargantext.Database.Action.Search (searchDocInDatabase)
88 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
89 import Gargantext.Database.Admin.Types.Hyperdata
90 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
91 import Gargantext.Database.Prelude
92 import Gargantext.Database.Query.Table.Ngrams
93 import Gargantext.Database.Query.Table.Node
94 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
95 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
96 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
97 import Gargantext.Database.Query.Table.NodeNodeNgrams2
98 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
99 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
100 import Gargantext.Database.Types
101 import Gargantext.Prelude
102 import Gargantext.Prelude.Crypto.Hash (Hash)
103 import qualified Gargantext.Core.Text.Corpus.API as API
104 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
105
106 ------------------------------------------------------------------------
107 -- Imports for upgrade function
108 import Gargantext.Database.Query.Tree.Root (getRootId)
109 import Gargantext.Database.Query.Tree (findNodesId)
110 import qualified Data.List as List
111 ------------------------------------------------------------------------
112 -- TODO use internal with API name (could be old data)
113 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
114 | ExternalOrigin { _do_api :: API.ExternalAPIs }
115 -- TODO Web
116 deriving (Generic, Eq)
117
118 makeLenses ''DataOrigin
119 deriveJSON (unPrefix "_do_") ''DataOrigin
120 instance ToSchema DataOrigin where
121 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
122
123 allDataOrigins :: [DataOrigin]
124 allDataOrigins = map InternalOrigin API.externalAPIs
125 <> map ExternalOrigin API.externalAPIs
126
127 ---------------
128 data DataText = DataOld ![NodeId]
129 | DataNew ![[HyperdataDocument]]
130
131 -- TODO use the split parameter in config file
132 getDataText :: FlowCmdM env err m
133 => DataOrigin
134 -> TermType Lang
135 -> API.Query
136 -> Maybe API.Limit
137 -> m DataText
138 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
139 <$> splitEvery 500
140 <$> API.get api (_tt_lang la) q li
141
142 getDataText (InternalOrigin _) _la q _li = do
143 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
144 (UserName userMaster)
145 (Left "")
146 (Nothing :: Maybe HyperdataCorpus)
147 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
148 pure $ DataOld ids
149
150 -------------------------------------------------------------------------------
151 flowDataText :: ( FlowCmdM env err m
152 )
153 => User
154 -> DataText
155 -> TermType Lang
156 -> CorpusId
157 -> Maybe FlowSocialListWith
158 -> (JobLog -> m ())
159 -> m CorpusId
160 flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
161 where
162 corpusType = (Nothing :: Maybe HyperdataCorpus)
163 flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus
164
165 ------------------------------------------------------------------------
166 -- TODO use proxy
167 flowAnnuaire :: (FlowCmdM env err m)
168 => User
169 -> Either CorpusName [CorpusId]
170 -> (TermType Lang)
171 -> FilePath
172 -> (JobLog -> m ())
173 -> m AnnuaireId
174 flowAnnuaire u n l filePath logStatus = do
175 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
176 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus
177
178 ------------------------------------------------------------------------
179 flowCorpusFile :: (FlowCmdM env err m)
180 => User
181 -> Either CorpusName [CorpusId]
182 -> Limit -- Limit the number of docs (for dev purpose)
183 -> TermType Lang -> FileFormat -> FilePath
184 -> Maybe FlowSocialListWith
185 -> (JobLog -> m ())
186 -> m CorpusId
187 flowCorpusFile u n l la ff fp mfslw logStatus = do
188 eParsed <- liftBase $ parseFile ff fp
189 case eParsed of
190 Right parsed -> do
191 let docs = splitEvery 500 $ take l parsed
192 flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
193 Left e -> panic $ "Error: " <> (T.pack e)
194
195 ------------------------------------------------------------------------
196 -- | TODO improve the needed type to create/update a corpus
197 -- (For now, Either is enough)
198 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
199 => User
200 -> Either CorpusName [CorpusId]
201 -> TermType Lang
202 -> Maybe FlowSocialListWith
203 -> [[a]]
204 -> (JobLog -> m ())
205 -> m CorpusId
206 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
207
208
209 flow :: ( FlowCmdM env err m
210 , FlowCorpus a
211 , MkCorpus c
212 )
213 => Maybe c
214 -> User
215 -> Either CorpusName [CorpusId]
216 -> TermType Lang
217 -> Maybe FlowSocialListWith
218 -> [[a]]
219 -> (JobLog -> m ())
220 -> m CorpusId
221 flow c u cn la mfslw docs logStatus = do
222 -- TODO if public insertMasterDocs else insertUserDocs
223 ids <- traverse (\(idx, doc) -> do
224 id <- insertMasterDocs c la doc
225 logStatus JobLog { _scst_succeeded = Just $ 1 + idx
226 , _scst_failed = Just 0
227 , _scst_remaining = Just $ length docs - idx
228 , _scst_events = Just []
229 }
230 pure id
231 ) (zip [1..] docs)
232 flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
233
234 ------------------------------------------------------------------------
235 flowCorpusUser :: ( FlowCmdM env err m
236 , MkCorpus c
237 )
238 => Lang
239 -> User
240 -> Either CorpusName [CorpusId]
241 -> Maybe c
242 -> [NodeId]
243 -> Maybe FlowSocialListWith
244 -> m CorpusId
245 flowCorpusUser l user corpusName ctype ids mfslw = do
246 -- User Flow
247 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
248 -- NodeTexts is first
249 _tId <- insertDefaultNode NodeTexts userCorpusId userId
250 -- printDebug "NodeTexts: " tId
251
252 -- NodeList is second
253 listId <- getOrMkList userCorpusId userId
254 -- _cooc <- insertDefaultNode NodeListCooc listId userId
255 -- TODO: check if present already, ignore
256 _ <- Doc.add userCorpusId ids
257
258 -- printDebug "Node Text Ids:" tId
259
260 -- User List Flow
261 (masterUserId, _masterRootId, masterCorpusId)
262 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
263
264 --let gp = (GroupParams l 2 3 (StopSize 3))
265 let gp = GroupWithPosTag l CoreNLP HashMap.empty
266 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
267
268 _userListId <- flowList_DbRepo listId ngs
269 _mastListId <- getOrMkList masterCorpusId masterUserId
270 -- _ <- insertOccsUpdates userCorpusId mastListId
271 -- printDebug "userListId" userListId
272 -- User Graph Flow
273 _ <- insertDefaultNode NodeDashboard userCorpusId userId
274 _ <- insertDefaultNode NodeGraph userCorpusId userId
275 --_ <- mkPhylo userCorpusId userId
276 -- Annuaire Flow
277 -- _ <- mkAnnuaire rootUserId userId
278 pure userCorpusId
279
280
281 insertMasterDocs :: ( FlowCmdM env err m
282 , FlowCorpus a
283 , MkCorpus c
284 )
285 => Maybe c
286 -> TermType Lang
287 -> [a]
288 -> m [DocId]
289 insertMasterDocs c lang hs = do
290 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
291 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
292 _ <- Doc.add masterCorpusId ids'
293 -- TODO
294 -- create a corpus with database name (CSV or PubMed)
295 -- add documents to the corpus (create node_node link)
296 -- this will enable global database monitoring
297
298 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
299 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
300 <- mapNodeIdNgrams
301 <$> documentIdWithNgrams
302 (extractNgramsT $ withLang lang documentsWithId)
303 documentsWithId
304
305 lId <- getOrMkList masterCorpusId masterUserId
306 _ <- saveDocNgramsWith lId mapNgramsDocs'
307
308 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
309 pure ids'
310
311 saveDocNgramsWith :: ( FlowCmdM env err m)
312 => ListId
313 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
314 -> m ()
315 saveDocNgramsWith lId mapNgramsDocs' = do
316 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
317 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
318
319 -- to be removed
320 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
321
322 -- new
323 mapCgramsId <- listInsertDb lId toNodeNgramsW'
324 $ map (first _ngramsTerms . second Map.keys)
325 $ HashMap.toList mapNgramsDocs
326
327 -- insertDocNgrams
328 _return <- insertNodeNodeNgrams2
329 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
330 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
331 <*> Just (fromIntegral w :: Double)
332 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
333 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
334 , (nId, w) <- Map.toList mapNodeIdWeight
335 ]
336 -- to be removed
337 _ <- insertDocNgrams lId indexedNgrams
338
339 pure ()
340
341
342 ------------------------------------------------------------------------
343 -- TODO Type NodeDocumentUnicised
344 insertDocs :: ( FlowCmdM env err m
345 -- , FlowCorpus a
346 , FlowInsertDB a
347 )
348 => UserId
349 -> CorpusId
350 -> [a]
351 -> m ([DocId], [Indexed NodeId a])
352 insertDocs uId cId hs = do
353 let docs = map addUniqId hs
354 newIds <- insertDb uId cId docs
355 -- printDebug "newIds" newIds
356 let
357 newIds' = map reId newIds
358 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
359 _ <- Doc.add cId newIds'
360 pure (newIds', documentsWithId)
361
362
363 ------------------------------------------------------------------------
364 viewUniqId' :: UniqId a
365 => a
366 -> (Hash, a)
367 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
368 where
369 err = panic "[ERROR] Database.Flow.toInsert"
370
371
372 toInserted :: [ReturnId]
373 -> Map Hash ReturnId
374 toInserted =
375 Map.fromList . map (\r -> (reUniqId r, r) )
376 . filter (\r -> reInserted r == True)
377
378 mergeData :: Map Hash ReturnId
379 -> Map Hash a
380 -> [Indexed NodeId a]
381 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
382 where
383 toDocumentWithId (sha,hpd) =
384 Indexed <$> fmap reId (lookup sha rs)
385 <*> Just hpd
386
387 ------------------------------------------------------------------------
388 ------------------------------------------------------------------------
389 ------------------------------------------------------------------------
390 documentIdWithNgrams :: HasNodeError err
391 => (a
392 -> Cmd err (HashMap b (Map NgramsType Int)))
393 -> [Indexed NodeId a]
394 -> Cmd err [DocumentIdWithNgrams a b]
395 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
396 where
397 toDocumentIdWithNgrams d = do
398 e <- f $ _unIndex d
399 pure $ DocumentIdWithNgrams d e
400
401
402 -- | TODO check optimization
403 mapNodeIdNgrams :: (Ord b, Hashable b)
404 => [DocumentIdWithNgrams a b]
405 -> HashMap b
406 (Map NgramsType
407 (Map NodeId Int)
408 )
409 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
410 where
411 f :: DocumentIdWithNgrams a b
412 -> HashMap b (Map NgramsType (Map NodeId Int))
413 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
414 where
415 nId = _index $ documentWithId d
416
417
418 ------------------------------------------------------------------------
419 instance ExtractNgramsT HyperdataContact
420 where
421 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
422 where
423 extract :: TermType Lang -> HyperdataContact
424 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
425 extract _l hc' = do
426 let authors = map text2ngrams
427 $ maybe ["Nothing"] (\a -> [a])
428 $ view (hc_who . _Just . cw_lastName) hc'
429
430 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
431
432
433 instance ExtractNgramsT HyperdataDocument
434 where
435 extractNgramsT :: TermType Lang
436 -> HyperdataDocument
437 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
438 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
439 where
440 extractNgramsT' :: TermType Lang
441 -> HyperdataDocument
442 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
443 extractNgramsT' lang' doc = do
444 let source = text2ngrams
445 $ maybe "Nothing" identity
446 $ _hd_source doc
447
448 institutes = map text2ngrams
449 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
450 $ _hd_institutes doc
451
452 authors = map text2ngrams
453 $ maybe ["Nothing"] (T.splitOn ", ")
454 $ _hd_authors doc
455
456 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
457 <$> concat
458 <$> liftBase (extractTerms lang' $ hasText doc)
459
460 pure $ HashMap.fromList
461 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
462 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
463 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
464 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
465
466 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
467 where
468 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
469
470 instance HasText a => HasText (Node a)
471 where
472 hasText (Node _ _ _ _ _ _ _ h) = hasText h
473
474
475
476 -- | TODO putelsewhere
477 -- | Upgrade function
478 -- Suppose all documents are English (this is the case actually)
479 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
480 indexAllDocumentsWithPosTag = do
481 rootId <- getRootId (UserName userMaster)
482 corpusIds <- findNodesId rootId [NodeCorpus]
483 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
484
485 _ <- mapM extractInsert (splitEvery 1000 docs)
486
487 pure ()
488
489 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
490 extractInsert docs = do
491 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
492
493 mapNgramsDocs' <- mapNodeIdNgrams
494 <$> documentIdWithNgrams
495 (extractNgramsT $ withLang (Multi EN) documentsWithId)
496 documentsWithId
497
498 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
499
500 pure ()
501
502