]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[DOC]
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
Module      : Gargantext.Database.Action.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( DataText(..)
28 , getDataText
29 , getDataText_Debug
30 , flowDataText
31 , flow
32
33 , flowCorpusFile
34 , flowCorpus
35 , flowAnnuaire
36 , insertMasterDocs
37 , saveDocNgramsWith
38
39 , getOrMkRoot
40 , getOrMk_RootWithCorpus
41 , TermType(..)
42 , DataOrigin(..)
43 , allDataOrigins
44
45 , do_api
46 , indexAllDocumentsWithPosTag
47 )
48 where
49
50 import Conduit
51 import Control.Lens ((^.), view, _Just, makeLenses)
52 import Data.Aeson.TH (deriveJSON)
53 import Data.Conduit.Internal (zipSources)
54 import Data.Either
55 import Data.HashMap.Strict (HashMap)
56 import Data.Hashable (Hashable)
57 import Data.List (concat)
58 import Data.Map (Map, lookup)
59 import Data.Maybe (catMaybes)
60 import Data.Monoid
61 import Data.Swagger
62 import qualified Data.Text as T
63 import Data.Traversable (traverse)
64 import Data.Tuple.Extra (first, second)
65 import GHC.Generics (Generic)
66 import Servant.Client (ClientError)
67 import System.FilePath (FilePath)
68 import qualified Data.HashMap.Strict as HashMap
69 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
70 import qualified Data.Map as Map
71 import qualified Data.Conduit.List as CL
72 import qualified Data.Conduit as C
73
74 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
75 import Gargantext.Core (Lang(..), PosTagAlgo(..))
76 import Gargantext.Core.Ext.IMT (toSchoolName)
77 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
78 import Gargantext.Core.Flow.Types
79 import Gargantext.Core.Text
80 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
81 import Gargantext.Core.Text.List (buildNgramsLists)
82 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
83 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
84 import Gargantext.Core.Text.Terms
85 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
86 import Gargantext.Core.Types (POS(NP))
87 import Gargantext.Core.Types.Individu (User(..))
88 import Gargantext.Core.Types.Main
89 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
90 import Gargantext.Database.Action.Flow.List
91 import Gargantext.Database.Action.Flow.Types
92 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
93 import Gargantext.Database.Action.Search (searchDocInDatabase)
94 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
95 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
96 import Gargantext.Database.Admin.Types.Hyperdata
97 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
98 import Gargantext.Database.Prelude
99 import Gargantext.Database.Query.Table.ContextNodeNgrams2
100 import Gargantext.Database.Query.Table.Ngrams
101 import Gargantext.Database.Query.Table.Node
102 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
103 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
104 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
105 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
106 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
107 import Gargantext.Database.Types
108 import Gargantext.Prelude
109 import Gargantext.Prelude.Crypto.Hash (Hash)
110 import qualified Gargantext.Core.Text.Corpus.API as API
111 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
112 import qualified Prelude
113
114 ------------------------------------------------------------------------
115 -- Imports for upgrade function
116 import Gargantext.Database.Query.Tree.Root (getRootId)
117 import Gargantext.Database.Query.Tree (findNodesId)
118 import qualified Data.List as List
119 ------------------------------------------------------------------------
120 -- TODO use internal with API name (could be old data)
-- | Where the documents of a flow come from: either data already indexed
-- in Gargantext itself ('InternalOrigin') or one of the external search
-- APIs ('ExternalOrigin'). Both carry the API tag used for the query.
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)
125
-- Lenses, JSON instances (with the "_do_" field prefix stripped) and the
-- Swagger schema for 'DataOrigin'.
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
130
-- | Every data origin we know about: each external API, once tagged as
-- internal and once as external.
allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin apis <> map ExternalOrigin apis
  where
    apis = API.externalAPIs
134
135 ---------------
-- | Documents to feed into a corpus: either ids of documents already
-- stored in the database ('DataOld'), or a stream of new documents
-- together with an optional total count used for progress reporting
-- ('DataNew').
data DataText = DataOld ![NodeId]
              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
              -- | DataNew ![[HyperdataDocument]]
139
-- Show instance is not possible because of IO
-- | Debug helper: dump a 'DataText' to stdout. For 'DataNew' this drains
-- the whole conduit into memory, so only use it on small streams.
printDataText :: DataText -> IO ()
printDataText (DataOld nodeIds) = putStrLn (show nodeIds)
printDataText (DataNew (mLen, docsC)) =
  C.runConduit (docsC .| CL.consume) >>= \docs -> putStrLn (show (mLen, docs))
146
-- TODO use the split parameter in config file
-- | Fetch documents for a flow. An external origin queries the remote
-- API (and can fail with a 'ClientError'); the internal origin searches
-- the master corpus in the local database and returns existing ids.
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li =
  -- Wrap the successful result as new (streamed) data.
  liftBase $ (DataNew <$>) <$> API.get api (_tt_lang la) q li
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  -- Stemming the query before the in-database full-text search.
  docIds <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ Right $ DataOld docIds
165
-- | Debug variant of 'getDataText': run the query and print either the
-- client error or the resulting documents to stdout.
getDataText_Debug :: FlowCmdM env err m
                  => DataOrigin
                  -> TermType Lang
                  -> API.Query
                  -> Maybe API.Limit
                  -> m ()
getDataText_Debug a l q li =
  getDataText a l q li >>= liftBase . either (putStrLn . show) printDataText
177
178
179
180 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into an existing corpus. Old ids go straight to
-- 'flowCorpusUser'; a new document stream is lifted into the current
-- monad and run through the full 'flowCorpus' pipeline.
flowDataText :: forall env err m.
                ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> (JobLog -> m ())
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ =
  flowCorpusUser (_tt_lang tt) u (Right [cid]) (Nothing :: Maybe HyperdataCorpus) ids mfslw
flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
  flowCorpus u (Right [cid]) tt mfslw (mLen, transPipe liftBase txtC) logStatus
196
197 ------------------------------------------------------------------------
198 -- TODO use proxy
-- | Flow an "annuaire" (contact directory) file into the database.
-- The whole file is read eagerly (TODO: stream it with a conduit).
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> (JobLog -> m ())
             -> m AnnuaireId
flowAnnuaire u n l filePath logStatus = do
  contacts <- liftBase (readFile_Annuaire filePath :: IO [HyperdataContact])
  let mLen = Just $ fromIntegral $ length contacts
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (mLen, yieldMany contacts) logStatus
210
211 ------------------------------------------------------------------------
-- | Parse a local file and flow its documents into a corpus.
-- NOTE(review): the limit parameter is currently ignored (see the
-- commented-out splitEvery/take logic in history) — confirm intent.
flowCorpusFile :: (FlowCmdM env err m)
           => User
           -> Either CorpusName [CorpusId]
           -> Limit -- Limit the number of docs (for dev purpose)
           -> TermType Lang
           -> FileType
           -> FileFormat
           -> FilePath
           -> Maybe FlowSocialListWith
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ft ff fp
  case eParsed of
    Left e       -> panic $ "Error: " <> T.pack e
    Right parsed ->
      flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
231
232 ------------------------------------------------------------------------
233 -- | TODO improve the needed type to create/update a corpus
234 -- (For now, Either is enough)
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
-- | 'flow' specialised to a user corpus (no explicit corpus hyperdata).
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> (Maybe Integer, ConduitT () a m ())
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpus u cn la mfslw docsC logStatus =
  flow (Nothing :: Maybe HyperdataCorpus) u cn la mfslw docsC logStatus
244
245
-- | Generic document flow: insert every document of the stream into the
-- master corpus (one DB insert per document, logging progress when the
-- total count is known), then build the user-side corpus structure with
-- 'flowCorpusUser'.
flow :: forall env err m a c.
        ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> (Maybe Integer, ConduitT () a m ())
     -> (JobLog -> m ())
     -> m CorpusId
flow c u cn la mfslw (mLength, docsC) logStatus = do
  -- TODO if public insertMasterDocs else insertUserDocs
  -- Pair each document with its 1-based index so progress can be logged.
  ids <- runConduit $
      zipSources (yieldMany [1..]) docsC
      .| mapMC insertDoc
      .| sinkList

  flowCorpusUser (la ^. tt_lang) u cn c ids mfslw

  where
    -- Insert one document into the master corpus and report progress.
    insertDoc :: (Integer, a) -> m NodeId
    insertDoc (idx, doc) = do
      docIds <- insertMasterDocs c la [doc]
      case mLength of
        Nothing  -> pure ()
        Just len ->
          logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + idx
                           , _scst_failed    = Just 0
                           , _scst_remaining = Just $ fromIntegral $ len - idx
                           , _scst_events    = Just []
                           }
      -- One document in, so exactly one id is expected back; fail loudly
      -- (instead of the former partial 'Prelude.head') if that invariant
      -- is ever broken.
      case docIds of
        (docId:_) -> pure docId
        []        -> panic "[ERROR] Database.Flow.flow: insertMasterDocs returned no id"
289
290
291
292 ------------------------------------------------------------------------
-- | Build (or reuse) the user-side corpus structure around the inserted
-- document ids: the user's root/corpus, its NodeTexts and list nodes,
-- the ngrams lists (user and master sides), dashboard and graph nodes,
-- and finally the ngrams occurrence counts. Returns the user corpus id.
--
-- The insertion order below matters: NodeTexts first, then the list,
-- then the documents are linked to the corpus.
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids mfslw = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  -- The master corpus is shared by all users; its lists are built from
  -- the same documents ("" name = master root).
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according the Lang
  let gp = GroupWithPosTag l CoreNLP HashMap.empty
  ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp

  -- printDebug "flowCorpusUser:ngs" ngs

  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire  rootUserId userId
  -- Refresh the occurrence counts now that the lists exist.
  _ <- updateNgramsOccurrences userCorpusId (Just listId)

  pure userCorpusId
342
343
-- | Insert documents into the shared master corpus and index their
-- ngrams there. Returns the ids of the inserted documents. The master
-- corpus aggregates every user's documents for global term statistics.
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  -- Extract ngrams from each inserted document and group them per
  -- ngrams / type / document-node.
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
                <- mapNodeIdNgrams
                <$> documentIdWithNgrams
                    (extractNgramsT $ withLang lang documentsWithId)
                    documentsWithId

  lId      <- getOrMkList masterCorpusId masterUserId
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'
373
-- | Persist extracted ngrams for a list: insert the ngrams terms
-- themselves, the list/ngrams rows, and the context-node-ngrams links
-- carrying per-document weights.
saveDocNgramsWith :: ( FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  --printDebug "terms2id" terms2id

  -- Drop the extraction wrapper; keys become plain ngrams.
  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ HashMap.toList mapNgramsDocs

  --printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  -- One row per (ngrams, type, document) with its weight; entries whose
  -- cgrams id cannot be resolved are silently dropped by catMaybes.
  _return <- insertContextNodeNgrams2
           $ catMaybes [ ContextNodeNgrams2 <$> Just nId
                                            <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
                                            <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]

  -- to be removed
  _   <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()
404
405
406 ------------------------------------------------------------------------
407 -- TODO Type NodeDocumentUnicised
-- TODO Type NodeDocumentUnicised
-- | Insert documents (deduplicated via their unique hash) under a
-- corpus, link them to it, and return the new context ids paired with
-- the documents indexed by those ids.
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    -- Join the returned ids back to the documents through their hash.
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)
425
426
427 ------------------------------------------------------------------------
-- | Pair a document with its unique hash; panics if the hash is missing
-- (documents here always come from 'addUniqId', which sets it).
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
  where
    -- Fixed message: it used to blame "toInsert", a different function.
    err = panic "[ERROR] Database.Flow.viewUniqId': unique id not set"
434
435
-- | Keep only the ids that were actually inserted (not pre-existing)
-- and index them by their unique hash.
toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted  -- was `\r -> reInserted r == True` (redundant comparison)
441
-- | Join inserted return-ids with their source documents by hash.
-- Documents whose hash has no inserted id are dropped.
mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toIndexed . Map.toList
  where
    toIndexed (hash, hpd) = (\ret -> Indexed (reId ret) hpd) <$> lookup hash rs
450
451 ------------------------------------------------------------------------
452 ------------------------------------------------------------------------
453 ------------------------------------------------------------------------
-- | Run an ngrams-extraction action over each indexed document and pair
-- the document with its extraction result.
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (HashMap b (Map NgramsType Int)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams extract = traverse $ \doc ->
  DocumentIdWithNgrams doc <$> extract (_unIndex doc)
464
465
-- | TODO check optimization
-- | Merge per-document ngrams maps into one map keyed by ngrams, with
-- per-type, per-document counts. Counts for the same (ngrams, type,
-- document) triple are summed.
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                       (Map NgramsType
                            (Map NodeId Int)
                       )
mapNodeIdNgrams docs = HashMap.unionsWith merge (map perDoc docs)
  where
    merge = Map.unionWith (Map.unionWith (+))
    -- Tag every count of one document with that document's node id.
    perDoc d =
      let docId = _index $ documentWithId d
      in  fmap (fmap (Map.singleton docId)) (documentNgrams d)
480
481
482 ------------------------------------------------------------------------
-- | Contacts only contribute their last name, indexed as an author.
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractLastName l hc
      where
        extractLastName :: TermType Lang -> HyperdataContact
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extractLastName _lang contact = pure $
          HashMap.fromList
            [ (SimpleNgrams ngram, Map.singleton Authors 1)
            | ngram <- map text2ngrams
                     $ maybe ["Nothing"] (\a -> [a])
                     $ view (hc_who . _Just . cw_lastName) contact
            ]
495
496
-- | Extract indexable ngrams from a document: its source, institutes,
-- authors (all as simple ngrams) and the POS-tagged terms of its text
-- (as enriched ngrams). Missing fields fall back to the literal
-- "Nothing" placeholder. Keys are truncated to 255 characters.
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              -- Institutes are comma-separated and normalized to school names.
              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (T.splitOn ", ")
                      $ _hd_authors doc

          -- NOTE(review): CoreNLP is hard-coded here as the POS tagger
          -- regardless of the language — confirm this is intended.
          terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
                 <$> concat
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $ [(SimpleNgrams source, Map.singleton Sources 1) ]
               <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
               <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
               <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
529
-- | A node's ngrams are those of its hyperdata payload (last positional
-- field of the NodePoly record).
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
533
-- | A node's text is the text of its hyperdata payload.
instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h
537
538
539
540 -- | TODO putelsewhere
541 -- | Upgrade function
542 -- Suppose all documents are English (this is the case actually)
-- | One-off upgrade: (re-)index every document of every corpus under
-- the master root with POS tagging, in batches of 1000.
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  masterRootId <- getRootId (UserName userMaster)
  corpusNodeIds <- findNodesId masterRootId [NodeCorpus]
  allDocs <- List.concat <$> mapM getDocumentsWithParentId corpusNodeIds
  _ <- mapM extractInsert (splitEvery 1000 allDocs)
  pure ()
551
-- | Extract ngrams from a batch of documents (assumed English, per the
-- upgrade note above) and insert the extracted terms.
extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  let indexedDocs = map (\doc -> Indexed (doc ^. node_id) doc) docs
  ngramsByDoc <- mapNodeIdNgrams
              <$> documentIdWithNgrams
                  (extractNgramsT $ withLang (Multi EN) indexedDocs)
                  indexedDocs
  _ <- insertExtractedNgrams $ HashMap.keys ngramsByDoc
  pure ()