2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
39 , getOrMk_RootWithCorpus
45 , indexAllDocumentsWithPosTag
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
59 import qualified Data.Text as T
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
68 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
69 import Gargantext.Core (Lang(..), PosTagAlgo(..))
70 import Gargantext.Core.Ext.IMT (toSchoolName)
71 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
72 import Gargantext.Core.Flow.Types
73 import Gargantext.Core.Text
74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Core.Text.List (buildNgramsLists)
76 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
77 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
78 import Gargantext.Core.Text.Terms
79 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
80 import Gargantext.Core.Types (POS(NP))
81 import Gargantext.Core.Types.Individu (User(..))
82 import Gargantext.Core.Types.Main
83 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
84 import Gargantext.Database.Action.Flow.List
85 import Gargantext.Database.Action.Flow.Types
86 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
87 import Gargantext.Database.Action.Search (searchDocInDatabase)
88 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
89 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
90 import Gargantext.Database.Admin.Types.Hyperdata
91 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
92 import Gargantext.Database.Prelude
93 import Gargantext.Database.Query.Table.ContextNodeNgrams2
94 import Gargantext.Database.Query.Table.Ngrams
95 import Gargantext.Database.Query.Table.Node
96 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
97 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
98 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
99 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
100 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
101 import Gargantext.Database.Types
102 import Gargantext.Prelude
103 import Gargantext.Prelude.Crypto.Hash (Hash)
104 import qualified Gargantext.Core.Text.Corpus.API as API
105 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
107 ------------------------------------------------------------------------
108 -- Imports for upgrade function
109 import Gargantext.Database.Query.Tree.Root (getRootId)
110 import Gargantext.Database.Query.Tree (findNodesId)
111 import qualified Data.List as List
112 ------------------------------------------------------------------------
113 -- TODO use internal with API name (could be old data)
-- | Provenance of a document set: data already stored locally
--   ('InternalOrigin') or data fetched from an external API
--   ('ExternalOrigin'). Both constructors carry the 'API.ExternalAPIs'
--   tag identifying the API concerned.
114 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
115 | ExternalOrigin { _do_api :: API.ExternalAPIs }
117 deriving (Generic, Eq)
-- Lenses and JSON instances derived with the "_do_" field prefix stripped.
119 makeLenses ''DataOrigin
120 deriveJSON (unPrefix "_do_") ''DataOrigin
-- Swagger schema follows the same "_do_" prefix convention.
121 instance ToSchema DataOrigin where
122 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every possible 'DataOrigin': each known external API wrapped both as
--   an internal and as an external origin.
124 allDataOrigins :: [DataOrigin]
125 allDataOrigins = map InternalOrigin API.externalAPIs
126 <> map ExternalOrigin API.externalAPIs
-- | Input for a corpus flow: ids of documents already in the database
--   ('DataOld') or batches of freshly fetched documents ('DataNew').
--   Fields are strict (bang-annotated).
129 data DataText = DataOld ![NodeId]
130 | DataNew ![[HyperdataDocument]]
132 -- TODO use the split parameter in config file
-- | Retrieve 'DataText' for a query: from an external API for
--   'ExternalOrigin', or by searching the master corpus locally for
--   'InternalOrigin'.
--   NOTE(review): several signature/body lines are elided in this view;
--   the full constraint list and final result expression are not shown.
133 getDataText :: FlowCmdM env err m
-- External origin: fetch new documents from the API in the term-type's
-- language, for query q with limit li.
139 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
141 <$> API.get api (_tt_lang la) q li
-- Internal origin: only the master corpus id is needed; the API tag and
-- language are ignored.
143 getDataText (InternalOrigin _) _la q _li = do
144 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
145 (UserName userMaster)
147 (Nothing :: Maybe HyperdataCorpus)
-- Stem the query before searching; keep only the document ids (fst).
148 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
151 -------------------------------------------------------------------------------
-- | Flow 'DataText' into the given corpus id: 'DataOld' reuses existing
--   node ids via 'flowCorpusUser' (no progress logging needed), while
--   'DataNew' inserts the new documents via 'flowCorpus'.
--   NOTE(review): parts of the signature are elided in this view.
152 flowDataText :: ( FlowCmdM env err m
158 -> Maybe FlowSocialListWith
161 flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
163 corpusType = (Nothing :: Maybe HyperdataCorpus)
164 flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus
166 ------------------------------------------------------------------------
-- | Flow an annuaire (contacts directory) file into the database:
--   contacts are read from the file and inserted in batches of 500
--   through the generic 'flow', typed as 'HyperdataAnnuaire'.
--   NOTE(review): parts of the signature are elided in this view.
168 flowAnnuaire :: (FlowCmdM env err m)
170 -> Either CorpusName [CorpusId]
175 flowAnnuaire u n l filePath logStatus = do
176 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
177 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus
179 ------------------------------------------------------------------------
-- | Parse a corpus file and flow its documents into the database:
--   documents are truncated to the given limit and inserted by batches
--   of 500; a parse failure panics with the parser's error message.
180 flowCorpusFile :: (FlowCmdM env err m)
182 -> Either CorpusName [CorpusId]
183 -> Limit -- Limit the number of docs (for dev purpose)
184 -> TermType Lang -> FileFormat -> FilePath
185 -> Maybe FlowSocialListWith
188 flowCorpusFile u n l la ff fp mfslw logStatus = do
189 eParsed <- liftBase $ parseFile ff fp
-- NOTE(review): the case scrutinee / Right-branch line is elided in this
-- view; `parsed` is presumably the successful parse result — confirm.
192 let docs = splitEvery 500 $ take l parsed
193 flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
194 Left e -> panic $ "Error: " <> (T.pack e)
196 ------------------------------------------------------------------------
197 -- | TODO improve the needed type to create/update a corpus
198 -- (For now, Either is enough)
-- | Flow documents into a (new or existing) user corpus: the generic
--   'flow' specialised to the 'HyperdataCorpus' node type.
199 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
201 -> Either CorpusName [CorpusId]
203 -> Maybe FlowSocialListWith
207 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic document flow: insert each batch of documents as the master
--   user (logging per-batch progress through 'logStatus'), then build the
--   user-level corpus structures from all inserted ids.
--   NOTE(review): several signature and body lines are elided in this
--   view (e.g. the batch-index pairing for `idx`).
210 flow :: ( FlowCmdM env err m
216 -> Either CorpusName [CorpusId]
218 -> Maybe FlowSocialListWith
222 flow c u cn la mfslw docs logStatus = do
223 -- TODO if public insertMasterDocs else insertUserDocs
-- Insert every (indexed) batch into the master corpus and report progress.
224 ids <- traverse (\(idx, doc) -> do
225 id <- insertMasterDocs c la doc
226 logStatus JobLog { _scst_succeeded = Just $ 1 + idx
227 , _scst_failed = Just 0
228 , _scst_remaining = Just $ length docs - idx
229 , _scst_events = Just []
-- Finally wire all inserted ids into the user's corpus.
233 flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
238 ------------------------------------------------------------------------
-- | Build the user-side corpus structure around already-inserted document
--   ids: ensure the user's root/corpus nodes exist, create default child
--   nodes (texts, list, dashboard, graph), attach the documents, build
--   ngrams lists against the shared master corpus, and refresh the
--   ngrams occurrence counts.
--   NOTE(review): some signature/body lines are elided in this view.
239 flowCorpusUser :: ( FlowCmdM env err m
244 -> Either CorpusName [CorpusId]
247 -> Maybe FlowSocialListWith
249 flowCorpusUser l user corpusName ctype ids mfslw = do
251 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
252 -- NodeTexts is first
253 _tId <- insertDefaultNode NodeTexts userCorpusId userId
254 -- printDebug "NodeTexts: " tId
256 -- NodeList is second
257 listId <- getOrMkList userCorpusId userId
258 -- _cooc <- insertDefaultNode NodeListCooc listId userId
259 -- TODO: check if present already, ignore
260 _ <- Doc.add userCorpusId ids
262 -- printDebug "Node Text Ids:" tId
-- The master corpus is the shared, cross-user corpus owned by userMaster.
265 (masterUserId, _masterRootId, masterCorpusId)
266 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
268 --let gp = (GroupParams l 2 3 (StopSize 3))
-- Group ngrams by CoreNLP POS tags for the corpus language.
269 let gp = GroupWithPosTag l CoreNLP HashMap.empty
270 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
272 -- printDebug "flowCorpusUser:ngs" ngs
274 _userListId <- flowList_DbRepo listId ngs
275 _mastListId <- getOrMkList masterCorpusId masterUserId
276 -- _ <- insertOccsUpdates userCorpusId mastListId
277 -- printDebug "userListId" userListId
279 _ <- insertDefaultNode NodeDashboard userCorpusId userId
280 _ <- insertDefaultNode NodeGraph userCorpusId userId
281 --_ <- mkPhylo userCorpusId userId
283 -- _ <- mkAnnuaire rootUserId userId
284 _ <- updateNgramsOccurrences userCorpusId (Just listId)
-- | Insert documents into the master (shared) corpus, then extract their
--   ngrams and persist them against the master list.
--   NOTE(review): constraint lines and some body lines are elided in this
--   view (e.g. the binding that feeds `mapNgramsDocs'`).
289 insertMasterDocs :: ( FlowCmdM env err m
297 insertMasterDocs c lang hs = do
298 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
299 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
300 _ <- Doc.add masterCorpusId ids'
302 -- create a corpus with database name (CSV or PubMed)
303 -- add documents to the corpus (create node_node link)
304 -- this will enable global database monitoring
306 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
307 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
309 <$> documentIdWithNgrams
310 (extractNgramsT $ withLang lang documentsWithId)
-- Persist the extracted ngrams against the master list.
313 lId <- getOrMkList masterCorpusId masterUserId
314 _ <- saveDocNgramsWith lId mapNgramsDocs'
316 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- | Persist extracted ngrams for a list: insert the ngrams themselves,
--   insert the per-list ngrams rows, then link contexts to ngrams with
--   their weights, and index the documents' ngrams.
--   NOTE(review): some signature/body lines are elided in this view.
319 saveDocNgramsWith :: ( FlowCmdM env err m)
321 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
323 saveDocNgramsWith lId mapNgramsDocs' = do
324 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
325 printDebug "terms2id" terms2id
327 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- Insert (term, ngrams-types) pairs for the list, keeping generated ids.
330 mapCgramsId <- listInsertDb lId toNodeNgramsW'
331 $ map (first _ngramsTerms . second Map.keys)
332 $ HashMap.toList mapNgramsDocs
334 printDebug "saveDocNgramsWith" mapCgramsId
-- Link every (context, ngrams-type, term) triple with its weight; triples
-- whose cgrams id cannot be resolved are dropped by catMaybes.
336 _return <- insertContextNodeNgrams2
337 $ catMaybes [ ContextNodeNgrams2 <$> Just nId
338 <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
339 <*> Just (fromIntegral w :: Double)
340 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
341 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
342 , (nId, w) <- Map.toList mapNodeIdWeight
-- Also index the documents' ngrams for the list.
346 _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
351 ------------------------------------------------------------------------
352 -- TODO Type NodeDocumentUnicised
-- | Insert (deduplicated) documents under a corpus node, returning the new
--   context ids and the documents indexed by their database id.
--   NOTE(review): constraint and some body lines are elided in this view.
353 insertDocs :: ( FlowCmdM env err m
360 -> m ([ContextId], [Indexed ContextId a])
361 insertDocs uId cId hs = do
-- addUniqId stamps each document with its dedup hash before insertion.
362 let docs = map addUniqId hs
363 newIds <- insertDb uId cId docs
364 -- printDebug "newIds" newIds
-- Join inserted return-ids back to the documents via their uniq hash.
366 newIds' = map reId newIds
367 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
368 _ <- Doc.add cId newIds'
369 pure (newIds', documentsWithId)
372 ------------------------------------------------------------------------
-- | Pair a document with its uniq-id hash; panics if the hash is absent
--   (documents are expected to have been stamped by addUniqId first).
373 viewUniqId' :: UniqId a
376 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
378 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the successfully inserted 'ReturnId's by their uniq-id hash,
--   discarding the rows that were not inserted.
381 toInserted :: [ReturnId]
384 Map.fromList . map (\r -> (reUniqId r, r) )
385 . filter (\r -> reInserted r == True)
-- | Join documents (keyed by their hash) with their database return ids,
--   dropping documents whose hash has no matching 'ReturnId'.
--   NOTE(review): part of the signature is elided in this view.
387 mergeData :: Map Hash ReturnId
389 -> [Indexed NodeId a]
390 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
392 toDocumentWithId (sha,hpd) =
393 Indexed <$> fmap reId (lookup sha rs)
396 ------------------------------------------------------------------------
397 ------------------------------------------------------------------------
398 ------------------------------------------------------------------------
-- | Run an ngrams-extraction action over each indexed document, pairing
--   every document with its extracted ngrams in a 'DocumentIdWithNgrams'.
--   NOTE(review): a line of the helper's body is elided in this view.
399 documentIdWithNgrams :: HasNodeError err
401 -> Cmd err (HashMap b (Map NgramsType Int)))
402 -> [Indexed NodeId a]
403 -> Cmd err [DocumentIdWithNgrams a b]
404 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
406 toDocumentIdWithNgrams d = do
408 pure $ DocumentIdWithNgrams d e
411 -- | TODO check optimization
-- Merge per-document ngrams maps into a single map keyed by ngrams,
-- summing counts per (ngrams-type, node id) pair.
412 mapNodeIdNgrams :: (Ord b, Hashable b)
413 => [DocumentIdWithNgrams a b]
418 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
420 f :: DocumentIdWithNgrams a b
421 -> HashMap b (Map NgramsType (Map NodeId Int))
-- Tag every count with its document's node id before merging.
422 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
424 nId = _index $ documentWithId d
427 ------------------------------------------------------------------------
-- | Ngrams extraction for contacts: only the last name is extracted, as
--   an Authors ngram; extracted terms are truncated to 255 characters.
--   NOTE(review): some body lines are elided in this view.
428 instance ExtractNgramsT HyperdataContact
430 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
432 extract :: TermType Lang -> HyperdataContact
433 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-- Falls back to the literal string "Nothing" when no last name is set.
435 let authors = map text2ngrams
436 $ maybe ["Nothing"] (\a -> [a])
437 $ view (hc_who . _Just . cw_lastName) hc'
439 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
-- | Ngrams extraction for documents: source, institutes, authors and
--   POS-enriched terms, each tagged with its 'NgramsType'; extracted
--   terms are truncated to 255 characters.
--   NOTE(review): the field-access lines feeding `source`, `institutes`
--   and `authors` are elided in this view.
442 instance ExtractNgramsT HyperdataDocument
444 extractNgramsT :: TermType Lang
446 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
447 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
449 extractNgramsT' :: TermType Lang
451 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
452 extractNgramsT' lang' doc = do
-- Missing fields fall back to the literal string "Nothing".
453 let source = text2ngrams
454 $ maybe "Nothing" identity
457 institutes = map text2ngrams
458 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
461 authors = map text2ngrams
462 $ maybe ["Nothing"] (T.splitOn ", ")
-- Terms from the document text, enriched with CoreNLP noun-phrase (NP)
-- POS tags for the document language.
465 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
467 <$> liftBase (extractTerms lang' $ hasText doc)
-- Each ngram is weighted 1 under its corresponding type.
469 pure $ HashMap.fromList
470 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
471 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
472 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
473 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | Delegate ngrams extraction to the node's hyperdata payload.
475 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
477 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-- | Delegate text access to the node's hyperdata payload.
479 instance HasText a => HasText (Node a)
481 hasText (Node _ _ _ _ _ _ _ h) = hasText h
485 -- | TODO put elsewhere
486 -- | Upgrade function
487 -- Suppose all documents are English (this is the case actually)
-- | Upgrade helper: walk every corpus under the master user's root and
--   re-extract/insert ngrams for its documents, in batches of 1000.
--   NOTE(review): part of the signature/result is elided in this view.
488 indexAllDocumentsWithPosTag :: FlowCmdM env err m
490 indexAllDocumentsWithPosTag = do
491 rootId <- getRootId (UserName userMaster)
492 corpusIds <- findNodesId rootId [NodeCorpus]
493 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
494 _ <- mapM extractInsert (splitEvery 1000 docs)
-- | Extract and insert ngrams for a batch of documents, treating them as
--   English (withLang (Multi EN)).
--   NOTE(review): the body continues past the end of this view.
497 extractInsert :: FlowCmdM env err m
498 => [Node HyperdataDocument] -> m ()
499 extractInsert docs = do
-- Index each document by its node id so extraction can tag counts per node.
500 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
501 mapNgramsDocs' <- mapNodeIdNgrams
502 <$> documentIdWithNgrams
503 (extractNgramsT $ withLang (Multi EN) documentsWithId)
505 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'