2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
39 , getOrMk_RootWithCorpus
45 , indexAllDocumentsWithPosTag
50 import Control.Lens ((^.), view, _Just, makeLenses)
51 import Data.Aeson.TH (deriveJSON)
52 import Data.Conduit.Internal (zipSources)
54 import Data.HashMap.Strict (HashMap)
55 import Data.Hashable (Hashable)
56 import Data.List (concat)
57 import Data.Map (Map, lookup)
58 import Data.Maybe (catMaybes)
61 import qualified Data.Text as T
62 import Data.Traversable (traverse)
63 import Data.Tuple.Extra (first, second)
64 import GHC.Generics (Generic)
65 import System.FilePath (FilePath)
66 import qualified Data.HashMap.Strict as HashMap
67 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
68 import qualified Data.Map as Map
70 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
71 import Gargantext.Core (Lang(..), PosTagAlgo(..))
72 import Gargantext.Core.Ext.IMT (toSchoolName)
73 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
74 import Gargantext.Core.Flow.Types
75 import Gargantext.Core.Text
76 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
77 import Gargantext.Core.Text.List (buildNgramsLists)
78 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
79 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
80 import Gargantext.Core.Text.Terms
81 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
82 import Gargantext.Core.Types (POS(NP))
83 import Gargantext.Core.Types.Individu (User(..))
84 import Gargantext.Core.Types.Main
85 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
86 import Gargantext.Database.Action.Flow.List
87 import Gargantext.Database.Action.Flow.Types
88 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
89 import Gargantext.Database.Action.Search (searchDocInDatabase)
90 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
91 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
92 import Gargantext.Database.Admin.Types.Hyperdata
93 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
94 import Gargantext.Database.Prelude
95 import Gargantext.Database.Query.Table.ContextNodeNgrams2
96 import Gargantext.Database.Query.Table.Ngrams
97 import Gargantext.Database.Query.Table.Node
98 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
99 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
100 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
101 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
102 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
103 import Gargantext.Database.Types
104 import Gargantext.Prelude
105 import Gargantext.Prelude.Crypto.Hash (Hash)
106 import qualified Gargantext.Core.Text.Corpus.API as API
107 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
108 import qualified Prelude as Prelude
110 ------------------------------------------------------------------------
111 -- Imports for upgrade function
112 import Gargantext.Database.Query.Tree.Root (getRootId)
113 import Gargantext.Database.Query.Tree (findNodesId)
114 import qualified Data.List as List
115 ------------------------------------------------------------------------
116 -- TODO use internal with API name (could be old data)
-- | Provenance of a data set: produced inside Gargantext ('InternalOrigin')
-- or fetched from an external service ('ExternalOrigin').  Both constructors
-- carry the API identifier in '_do_api'.
-- NOTE(review): this view of the file is a numbered listing with missing
-- lines, so some surrounding declarations are incomplete here.
117 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
118 | ExternalOrigin { _do_api :: API.ExternalAPIs }
120 deriving (Generic, Eq)
-- Lenses, JSON instances (with the "_do_" field prefix stripped) and the
-- Swagger schema are all derived mechanically from the record above.
122 makeLenses ''DataOrigin
123 deriveJSON (unPrefix "_do_") ''DataOrigin
124 instance ToSchema DataOrigin where
125 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every known 'DataOrigin': each external API listed once wrapped as
-- internal, then once as external (internal origins come first).
127 allDataOrigins :: [DataOrigin]
128 allDataOrigins = map InternalOrigin API.externalAPIs
129 <> map ExternalOrigin API.externalAPIs
-- | Textual data to be flowed into a corpus: either ids of documents already
-- present in the database ('DataOld'), or a conduit producing fresh
-- 'HyperdataDocument's ('DataNew').  Both fields are strict.
132 data DataText = DataOld ![NodeId]
133 | DataNew !(ConduitT () HyperdataDocument IO ())
134 -- | DataNew ![[HyperdataDocument]]
136 -- TODO use the split parameter in config file
-- | Fetch 'DataText' for a query @q@ with an optional limit @li@.
-- External origins stream documents from the remote API; the internal
-- origin searches the master corpus in the database and returns ids.
-- NOTE(review): several lines of this definition are missing from this
-- view (signature tail, return expressions) — do not infer more than shown.
137 getDataText :: FlowCmdM env err m
143 getDataText (ExternalOrigin api) la q li = liftBase $ do
144 docsC <- API.get api (_tt_lang la) q li
147 getDataText (InternalOrigin _) _la q _li = do
148 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
149 (UserName userMaster)
151 (Nothing :: Maybe HyperdataCorpus)
-- Stem the query before searching; only the document ids (fst) are kept.
152 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
155 -------------------------------------------------------------------------------
-- | Flow previously fetched 'DataText' into the corpus @cid@.
-- 'DataOld' ids go straight to 'flowCorpusUser'; 'DataNew' conduits are
-- lifted into the target monad and handed to 'flowCorpus'.
-- NOTE(review): signature lines are missing from this view.
156 flowDataText :: forall env err m.
163 -> Maybe FlowSocialListWith
166 flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
168 corpusType = (Nothing :: Maybe HyperdataCorpus)
169 flowDataText u (DataNew txtC) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw (transPipe liftBase txtC) logStatus
171 ------------------------------------------------------------------------
-- | Flow an annuaire (directory of contacts) file into the database.
-- Reads the whole file into memory as '[HyperdataContact]' before flowing —
-- the existing TODO below notes a conduit-based version is wanted.
173 flowAnnuaire :: (FlowCmdM env err m)
175 -> Either CorpusName [CorpusId]
180 flowAnnuaire u n l filePath logStatus = do
181 -- TODO Conduit for file
182 docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
183 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (yieldMany docs) logStatus
185 ------------------------------------------------------------------------
-- | Parse a corpus file of the given 'FileFormat' and flow it.
-- NOTE(review): the 'Limit' argument is bound as @_l@ and never used —
-- the commented-out lines below suggest it once limited the doc count;
-- confirm whether this is intentional.
186 flowCorpusFile :: (FlowCmdM env err m)
188 -> Either CorpusName [CorpusId]
189 -> Limit -- Limit the number of docs (for dev purpose)
190 -> TermType Lang -> FileFormat -> FilePath
191 -> Maybe FlowSocialListWith
194 flowCorpusFile u n _l la ff fp mfslw logStatus = do
195 eParsed <- liftBase $ parseFile ff fp
198 flowCorpus u n la mfslw (yieldMany parsed .| mapC toHyperdataDocument) logStatus
199 --let docs = splitEvery 500 $ take l parsed
200 --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
-- Parse failures abort the job via 'panic' (no recovery attempted here).
201 Left e -> panic $ "Error: " <> (T.pack e)
203 ------------------------------------------------------------------------
204 -- | TODO improve the needed type to create/update a corpus
205 -- (For now, Either is enough)
-- 'flowCorpus' is 'flow' specialised to a corpus with no hyperdata
-- ('Nothing :: Maybe HyperdataCorpus').
206 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
208 -> Either CorpusName [CorpusId]
210 -> Maybe FlowSocialListWith
211 -> ConduitT () a m ()
214 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Core ingestion pipeline: number the incoming documents, insert each one
-- into the master corpus via 'insertDoc', then link the resulting ids to the
-- user corpus with 'flowCorpusUser'.
-- NOTE(review): lines are missing between the conduit expression (233) and
-- the final call (245), so the binding of @ids@ is not visible here.
217 flow :: forall env err m a c.
224 -> Either CorpusName [CorpusId]
226 -> Maybe FlowSocialListWith
227 -> ConduitT () a m ()
230 flow c u cn la mfslw docsC _logStatus = do
231 -- TODO if public insertMasterDocs else insertUserDocs
233 zipSources (yieldMany [1..]) docsC
236 -- ids <- traverse (\(idx, doc) -> do
237 -- id <- insertMasterDocs c la doc
238 -- logStatus JobLog { _scst_succeeded = Just $ 1 + idx
239 -- , _scst_failed = Just 0
240 -- , _scst_remaining = Just $ length docs - idx
241 -- , _scst_events = Just []
244 -- ) (zip [1..] docs)
245 flowCorpusUser (la ^. tt_lang) u cn c ids mfslw
-- Insert a single indexed document into the master corpus; the index is
-- currently unused (progress reporting is commented out).
248 insertDoc :: (Int, a) -> m NodeId
249 insertDoc (_idx, doc) = do
250 id <- insertMasterDocs c la [doc]
251 -- logStatus JobLog { _scst_succeeded = Just $ 1 + idx
252 -- , _scst_failed = Just 0
253 -- , _scst_remaining = Just $ length docs - idx
254 -- , _scst_events = Just []
-- NOTE(review): 'Prelude.head' is partial — this crashes if
-- insertMasterDocs returns an empty list; a total alternative
-- (e.g. pattern match with an explicit error) would be safer.
256 pure $ Prelude.head id
260 ------------------------------------------------------------------------
-- | Attach freshly inserted documents to the user's corpus and build the
-- associated structures: NodeTexts, the ngrams list (user + master),
-- dashboard and graph nodes, then refresh ngrams occurrence counts.
-- NOTE(review): several signature/body lines are missing from this view.
261 flowCorpusUser :: ( FlowCmdM env err m
266 -> Either CorpusName [CorpusId]
269 -> Maybe FlowSocialListWith
271 flowCorpusUser l user corpusName ctype ids mfslw = do
273 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
274 -- NodeTexts is first
275 _tId <- insertDefaultNode NodeTexts userCorpusId userId
276 -- printDebug "NodeTexts: " tId
278 -- NodeList is second
279 listId <- getOrMkList userCorpusId userId
280 -- _cooc <- insertDefaultNode NodeListCooc listId userId
281 -- TODO: check if present already, ignore
282 _ <- Doc.add userCorpusId ids
284 -- printDebug "Node Text Ids:" tId
-- The master corpus is looked up (or created) under the master user.
287 (masterUserId, _masterRootId, masterCorpusId)
288 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
290 --let gp = (GroupParams l 2 3 (StopSize 3))
-- Grouping uses CoreNLP POS tagging with no pre-seeded groups.
291 let gp = GroupWithPosTag l CoreNLP HashMap.empty
292 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
294 -- printDebug "flowCorpusUser:ngs" ngs
296 _userListId <- flowList_DbRepo listId ngs
297 _mastListId <- getOrMkList masterCorpusId masterUserId
298 -- _ <- insertOccsUpdates userCorpusId mastListId
299 -- printDebug "userListId" userListId
301 _ <- insertDefaultNode NodeDashboard userCorpusId userId
302 _ <- insertDefaultNode NodeGraph userCorpusId userId
303 --_ <- mkPhylo userCorpusId userId
305 -- _ <- mkAnnuaire rootUserId userId
306 _ <- updateNgramsOccurrences userCorpusId (Just listId)
-- | Insert documents into the master corpus, extract their ngrams and save
-- them against the master list.  Returns (not visible here) are truncated
-- in this view; several signature and body lines are missing.
311 insertMasterDocs :: ( FlowCmdM env err m
319 insertMasterDocs c lang hs = do
320 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
321 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
322 _ <- Doc.add masterCorpusId ids'
324 -- create a corpus with database name (CSV or PubMed)
325 -- add documents to the corpus (create node_node link)
326 -- this will enable global database monitoring
328 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
329 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
331 <$> documentIdWithNgrams
332 (extractNgramsT $ withLang lang documentsWithId)
335 lId <- getOrMkList masterCorpusId masterUserId
336 _ <- saveDocNgramsWith lId mapNgramsDocs'
338 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- | Persist extracted ngrams for list @lId@: insert the ngrams themselves,
-- register them as node-ngrams rows, then write the per-context weights.
341 saveDocNgramsWith :: ( FlowCmdM env err m)
343 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
345 saveDocNgramsWith lId mapNgramsDocs' = do
346 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
347 printDebug "terms2id" terms2id
-- Drop the extraction wrapper: keys become plain ngrams.
349 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
352 mapCgramsId <- listInsertDb lId toNodeNgramsW'
353 $ map (first _ngramsTerms . second Map.keys)
354 $ HashMap.toList mapNgramsDocs
356 printDebug "saveDocNgramsWith" mapCgramsId
-- Build one ContextNodeNgrams2 row per (term, type, context) triple;
-- rows whose cgrams id cannot be resolved are silently dropped by catMaybes.
358 _return <- insertContextNodeNgrams2
359 $ catMaybes [ ContextNodeNgrams2 <$> Just nId
360 <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
361 <*> Just (fromIntegral w :: Double)
362 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
363 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
364 , (nId, w) <- Map.toList mapNodeIdWeight
368 _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
373 ------------------------------------------------------------------------
374 -- TODO Type NodeDocumentUnicised
-- | Insert documents under corpus @cId@ for user @uId@, deduplicating via
-- 'addUniqId' hashes, and return the new context ids paired with the
-- documents indexed by their ids.
375 insertDocs :: ( FlowCmdM env err m
382 -> m ([ContextId], [Indexed ContextId a])
383 insertDocs uId cId hs = do
384 let docs = map addUniqId hs
385 newIds <- insertDb uId cId docs
386 -- printDebug "newIds" newIds
388 newIds' = map reId newIds
-- Join returned ids with the original documents by their uniq hash.
389 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
390 _ <- Doc.add cId newIds'
391 pure (newIds', documentsWithId)
394 ------------------------------------------------------------------------
-- | Pair a document with its uniq-id hash; panics if the hash is absent
-- (documents are expected to have passed through 'addUniqId' first).
395 viewUniqId' :: UniqId a
398 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
400 err = panic "[ERROR] Database.Flow.toInsert"
-- | Keep only the 'ReturnId's actually inserted and index them by uniq id.
-- NOTE(review): @reInserted r == True@ is redundant — @reInserted r@
-- alone suffices (hlint: "Redundant ==").
403 toInserted :: [ReturnId]
406 Map.fromList . map (\r -> (reUniqId r, r) )
407 . filter (\r -> reInserted r == True)
-- | Join inserted ids (keyed by hash) with the hash->document map, keeping
-- only documents whose hash has a matching 'ReturnId'.
-- NOTE(review): @catMaybes . map f@ could be @mapMaybe f@ (Data.Maybe).
409 mergeData :: Map Hash ReturnId
411 -> [Indexed NodeId a]
412 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
414 toDocumentWithId (sha,hpd) =
415 Indexed <$> fmap reId (lookup sha rs)
418 ------------------------------------------------------------------------
419 ------------------------------------------------------------------------
420 ------------------------------------------------------------------------
-- | Run the ngrams-extraction action @f@ over each indexed document,
-- pairing every document with its extracted ngrams.
421 documentIdWithNgrams :: HasNodeError err
423 -> Cmd err (HashMap b (Map NgramsType Int)))
424 -> [Indexed NodeId a]
425 -> Cmd err [DocumentIdWithNgrams a b]
426 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
428 toDocumentIdWithNgrams d = do
430 pure $ DocumentIdWithNgrams d e
433 -- | TODO check optimization
-- Aggregate per-document ngrams into one map keyed by ngram, summing counts
-- per (NgramsType, NodeId) across all documents.
434 mapNodeIdNgrams :: (Ord b, Hashable b)
435 => [DocumentIdWithNgrams a b]
440 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
-- Turn a single document's counts into singleton maps keyed by its node id.
442 f :: DocumentIdWithNgrams a b
443 -> HashMap b (Map NgramsType (Map NodeId Int))
444 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
446 nId = _index $ documentWithId d
449 ------------------------------------------------------------------------
-- | For contacts, only the last name is extracted, tagged as an Authors
-- ngram (falling back to the literal string "Nothing" when absent).
450 instance ExtractNgramsT HyperdataContact
452 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
454 extract :: TermType Lang -> HyperdataContact
455 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
457 let authors = map text2ngrams
458 $ maybe ["Nothing"] (\a -> [a])
459 $ view (hc_who . _Just . cw_lastName) hc'
461 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
-- | For documents, extract source, institutes, authors and (POS-enriched)
-- terms; missing metadata falls back to the literal string "Nothing".
-- NOTE(review): lines naming which document fields feed 'institutes' and
-- 'authors' are missing from this view.
464 instance ExtractNgramsT HyperdataDocument
466 extractNgramsT :: TermType Lang
468 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
469 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
471 extractNgramsT' :: TermType Lang
473 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
474 extractNgramsT' lang' doc = do
475 let source = text2ngrams
476 $ maybe "Nothing" identity
-- Institutes are split on ", " and normalised through toSchoolName.
479 institutes = map text2ngrams
480 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
483 authors = map text2ngrams
484 $ maybe ["Nothing"] (T.splitOn ", ")
-- Terms are extracted from the document text and enriched with CoreNLP
-- POS tags (noun phrases, NP).
487 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
489 <$> liftBase (extractTerms lang' $ hasText doc)
491 pure $ HashMap.fromList
492 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
493 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
494 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
495 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | Both instances delegate to the node's hyperdata payload (the last
-- field of the 'Node' constructor).
497 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
499 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
501 instance HasText a => HasText (Node a)
503 hasText (Node _ _ _ _ _ _ _ h) = hasText h
507 -- | TODO: put this elsewhere
508 -- | Upgrade function.
509 -- Assumes all documents are English (currently the case in practice).
-- Walks every corpus under the master root, gathers all documents and
-- re-indexes them in batches of 1000 via 'extractInsert'.
510 indexAllDocumentsWithPosTag :: FlowCmdM env err m
512 indexAllDocumentsWithPosTag = do
513 rootId <- getRootId (UserName userMaster)
514 corpusIds <- findNodesId rootId [NodeCorpus]
-- All documents of all corpora are loaded into memory here; the batching
-- below only limits the extraction step, not the initial load.
515 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
516 _ <- mapM extractInsert (splitEvery 1000 docs)
-- | Extract ngrams from a batch of documents (hard-coded @Multi EN@
-- language) and insert them.
-- NOTE(review): this definition continues past the end of the visible
-- chunk — its remaining statements are not shown here.
519 extractInsert :: FlowCmdM env err m
520 => [Node HyperdataDocument] -> m ()
521 extractInsert docs = do
522 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
523 mapNgramsDocs' <- mapNodeIdNgrams
524 <$> documentIdWithNgrams
525 (extractNgramsT $ withLang (Multi EN) documentsWithId)
527 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'