2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
39 , getOrMk_RootWithCorpus
45 , indexAllDocumentsWithPosTag
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
59 import qualified Data.Text as T
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
68 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
69 import Gargantext.Core (Lang(..), PosTagAlgo(..))
70 import Gargantext.Core.Ext.IMT (toSchoolName)
71 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
72 import Gargantext.Core.Flow.Types
73 import Gargantext.Core.Text
74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Core.Text.List (buildNgramsLists)
76 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
77 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
78 import Gargantext.Core.Text.Terms
79 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
80 import Gargantext.Core.Types (POS(NP))
81 import Gargantext.Core.Types.Individu (User(..))
82 import Gargantext.Core.Types.Main
83 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
84 import Gargantext.Database.Action.Flow.List
85 import Gargantext.Database.Action.Flow.Types
86 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
87 import Gargantext.Database.Action.Search (searchDocInDatabase)
88 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
89 import Gargantext.Database.Admin.Types.Hyperdata
90 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
91 import Gargantext.Database.Prelude
92 import Gargantext.Database.Query.Table.ContextNodeNgrams2
93 import Gargantext.Database.Query.Table.Ngrams
94 import Gargantext.Database.Query.Table.Node
95 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
96 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
97 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
98 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
99 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
100 import Gargantext.Database.Types
101 import Gargantext.Prelude
102 import Gargantext.Prelude.Crypto.Hash (Hash)
103 import qualified Gargantext.Core.Text.Corpus.API as API
104 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
106 ------------------------------------------------------------------------
107 -- Imports for upgrade function
108 import Gargantext.Database.Query.Tree.Root (getRootId)
109 import Gargantext.Database.Query.Tree (findNodesId)
110 import qualified Data.List as List
111 ------------------------------------------------------------------------
112 -- TODO use internal with API name (could be old data)
113 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
114 | ExternalOrigin { _do_api :: API.ExternalAPIs }
116 deriving (Generic, Eq)
118 makeLenses ''DataOrigin
119 deriveJSON (unPrefix "_do_") ''DataOrigin
120 instance ToSchema DataOrigin where
121 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
123 allDataOrigins :: [DataOrigin]
124 allDataOrigins = map InternalOrigin API.externalAPIs
125 <> map ExternalOrigin API.externalAPIs
128 data DataText = DataOld ![NodeId]
129 | DataNew ![[HyperdataDocument]]
131 -- TODO use the split parameter in config file
132 getDataText :: FlowCmdM env err m
138 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
140 <$> API.get api (_tt_lang la) q li
142 getDataText (InternalOrigin _) _la q _li = do
143 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
144 (UserName userMaster)
146 (Nothing :: Maybe HyperdataCorpus)
147 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
150 -------------------------------------------------------------------------------
151 flowDataText :: ( FlowCmdM env err m
157 -> Maybe FlowSocialListWith
160 flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
162 corpusType = (Nothing :: Maybe HyperdataCorpus)
163 flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus
165 ------------------------------------------------------------------------
167 flowAnnuaire :: (FlowCmdM env err m)
169 -> Either CorpusName [CorpusId]
174 flowAnnuaire u n l filePath logStatus = do
175 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
176 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus
178 ------------------------------------------------------------------------
179 flowCorpusFile :: (FlowCmdM env err m)
181 -> Either CorpusName [CorpusId]
182 -> Limit -- Limit the number of docs (for dev purpose)
183 -> TermType Lang -> FileFormat -> FilePath
184 -> Maybe FlowSocialListWith
187 flowCorpusFile u n l la ff fp mfslw logStatus = do
188 eParsed <- liftBase $ parseFile ff fp
191 let docs = splitEvery 500 $ take l parsed
192 flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
193 Left e -> panic $ "Error: " <> (T.pack e)
195 ------------------------------------------------------------------------
196 -- | TODO improve the needed type to create/update a corpus
197 -- (For now, Either is enough)
198 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
200 -> Either CorpusName [CorpusId]
202 -> Maybe FlowSocialListWith
206 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
209 flow :: ( FlowCmdM env err m
215 -> Either CorpusName [CorpusId]
217 -> Maybe FlowSocialListWith
221 flow c u cn la mfslw docs logStatus = do
222 -- TODO if public insertMasterDocs else insertUserDocs
223 ids <- traverse (\(idx, doc) -> do
224 id <- insertMasterDocs c la doc
225 logStatus JobLog { _scst_succeeded = Just $ 1 + idx
226 , _scst_failed = Just 0
227 , _scst_remaining = Just $ length docs - idx
228 , _scst_events = Just []
232 flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
237 ------------------------------------------------------------------------
238 flowCorpusUser :: ( FlowCmdM env err m
243 -> Either CorpusName [CorpusId]
246 -> Maybe FlowSocialListWith
248 flowCorpusUser l user corpusName ctype ids mfslw = do
250 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
251 -- NodeTexts is first
252 _tId <- insertDefaultNode NodeTexts userCorpusId userId
253 -- printDebug "NodeTexts: " tId
255 -- NodeList is second
256 listId <- getOrMkList userCorpusId userId
257 -- _cooc <- insertDefaultNode NodeListCooc listId userId
258 -- TODO: check if present already, ignore
259 _ <- Doc.add userCorpusId ids
261 -- printDebug "Node Text Ids:" tId
264 (masterUserId, _masterRootId, masterCorpusId)
265 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
267 --let gp = (GroupParams l 2 3 (StopSize 3))
268 let gp = GroupWithPosTag l CoreNLP HashMap.empty
269 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
271 -- printDebug "flowCorpusUser:ngs" ngs
273 _userListId <- flowList_DbRepo listId ngs
274 _mastListId <- getOrMkList masterCorpusId masterUserId
275 -- _ <- insertOccsUpdates userCorpusId mastListId
276 -- printDebug "userListId" userListId
278 _ <- insertDefaultNode NodeDashboard userCorpusId userId
279 _ <- insertDefaultNode NodeGraph userCorpusId userId
280 --_ <- mkPhylo userCorpusId userId
282 -- _ <- mkAnnuaire rootUserId userId
286 insertMasterDocs :: ( FlowCmdM env err m
294 insertMasterDocs c lang hs = do
295 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
296 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
297 _ <- Doc.add masterCorpusId ids'
299 -- create a corpus with database name (CSV or PubMed)
300 -- add documents to the corpus (create node_node link)
301 -- this will enable global database monitoring
303 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
304 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
306 <$> documentIdWithNgrams
307 (extractNgramsT $ withLang lang documentsWithId)
310 lId <- getOrMkList masterCorpusId masterUserId
311 _ <- saveDocNgramsWith lId mapNgramsDocs'
313 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
316 saveDocNgramsWith :: ( FlowCmdM env err m)
318 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
320 saveDocNgramsWith lId mapNgramsDocs' = do
321 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
322 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
325 mapCgramsId <- listInsertDb lId toNodeNgramsW'
326 $ map (first _ngramsTerms . second Map.keys)
327 $ HashMap.toList mapNgramsDocs
329 -- printDebug "saveDocNgramsWith" mapCgramsId
331 _return <- insertContextNodeNgrams2
332 $ catMaybes [ ContextNodeNgrams2 <$> Just nId
333 <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
334 <*> Just (fromIntegral w :: Double)
335 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
336 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
337 , (nId, w) <- Map.toList mapNodeIdWeight
341 _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
346 ------------------------------------------------------------------------
347 -- TODO Type NodeDocumentUnicised
348 insertDocs :: ( FlowCmdM env err m
355 -> m ([ContextId], [Indexed ContextId a])
356 insertDocs uId cId hs = do
357 let docs = map addUniqId hs
358 newIds <- insertDb uId cId docs
359 -- printDebug "newIds" newIds
361 newIds' = map reId newIds
362 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
363 _ <- Doc.add cId newIds'
364 pure (newIds', documentsWithId)
367 ------------------------------------------------------------------------
368 viewUniqId' :: UniqId a
371 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
373 err = panic "[ERROR] Database.Flow.toInsert"
376 toInserted :: [ReturnId]
379 Map.fromList . map (\r -> (reUniqId r, r) )
380 . filter (\r -> reInserted r == True)
382 mergeData :: Map Hash ReturnId
384 -> [Indexed NodeId a]
385 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
387 toDocumentWithId (sha,hpd) =
388 Indexed <$> fmap reId (lookup sha rs)
391 ------------------------------------------------------------------------
392 ------------------------------------------------------------------------
393 ------------------------------------------------------------------------
394 documentIdWithNgrams :: HasNodeError err
396 -> Cmd err (HashMap b (Map NgramsType Int)))
397 -> [Indexed NodeId a]
398 -> Cmd err [DocumentIdWithNgrams a b]
399 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
401 toDocumentIdWithNgrams d = do
403 pure $ DocumentIdWithNgrams d e
406 -- | TODO check optimization
407 mapNodeIdNgrams :: (Ord b, Hashable b)
408 => [DocumentIdWithNgrams a b]
413 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
415 f :: DocumentIdWithNgrams a b
416 -> HashMap b (Map NgramsType (Map NodeId Int))
417 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
419 nId = _index $ documentWithId d
422 ------------------------------------------------------------------------
423 instance ExtractNgramsT HyperdataContact
425 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
427 extract :: TermType Lang -> HyperdataContact
428 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
430 let authors = map text2ngrams
431 $ maybe ["Nothing"] (\a -> [a])
432 $ view (hc_who . _Just . cw_lastName) hc'
434 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
437 instance ExtractNgramsT HyperdataDocument
439 extractNgramsT :: TermType Lang
441 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
442 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
444 extractNgramsT' :: TermType Lang
446 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
447 extractNgramsT' lang' doc = do
448 let source = text2ngrams
449 $ maybe "Nothing" identity
452 institutes = map text2ngrams
453 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
456 authors = map text2ngrams
457 $ maybe ["Nothing"] (T.splitOn ", ")
460 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
462 <$> liftBase (extractTerms lang' $ hasText doc)
464 pure $ HashMap.fromList
465 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
466 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
467 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
468 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
470 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
472 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
474 instance HasText a => HasText (Node a)
476 hasText (Node _ _ _ _ _ _ _ h) = hasText h
480 -- | TODO putelsewhere
481 -- | Upgrade function
482 -- Suppose all documents are English (this is the case actually)
483 indexAllDocumentsWithPosTag :: FlowCmdM env err m
485 indexAllDocumentsWithPosTag = do
486 rootId <- getRootId (UserName userMaster)
487 corpusIds <- findNodesId rootId [NodeCorpus]
488 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
489 _ <- mapM extractInsert (splitEvery 1000 docs)
492 extractInsert :: FlowCmdM env err m
493 => [Node HyperdataDocument] -> m ()
494 extractInsert docs = do
495 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
496 mapNgramsDocs' <- mapNodeIdNgrams
497 <$> documentIdWithNgrams
498 (extractNgramsT $ withLang (Multi EN) documentsWithId)
500 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'