2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
41 , getOrMk_RootWithCorpus
47 , indexAllDocumentsWithPosTag
52 import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
53 import Control.Monad.Reader (MonadReader)
54 import Data.Aeson.TH (deriveJSON)
55 import Data.Conduit.Internal (zipSources)
57 import Data.HashMap.Strict (HashMap)
58 import Data.Hashable (Hashable)
59 import Data.List (concat)
60 import Data.Map (Map, lookup)
61 import Data.Maybe (catMaybes)
64 import qualified Data.Text as T
65 import Data.Tuple.Extra (first, second)
66 import GHC.Generics (Generic)
67 import Servant.Client (ClientError)
68 import System.FilePath (FilePath)
69 import qualified Data.HashMap.Strict as HashMap
70 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
71 import qualified Data.Map as Map
72 import qualified Data.Conduit.List as CL
73 import qualified Data.Conduit as C
75 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
76 import Gargantext.Core (Lang(..), PosTagAlgo(..))
77 import Gargantext.Core.Ext.IMT (toSchoolName)
78 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
79 import Gargantext.Core.Flow.Types
80 import Gargantext.Core.Text
81 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
82 import Gargantext.Core.Text.List (buildNgramsLists)
83 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
84 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
85 import Gargantext.Core.Text.Terms
86 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
87 import Gargantext.Core.Types (POS(NP), TermsCount)
88 import Gargantext.Core.Types.Individu (User(..))
89 import Gargantext.Core.Types.Main
90 import Gargantext.Core.Utils (addTuples)
91 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
92 import Gargantext.Database.Action.Flow.List
93 import Gargantext.Database.Action.Flow.Types
94 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
95 import Gargantext.Database.Action.Search (searchDocInDatabase)
96 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
97 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
98 import Gargantext.Database.Admin.Types.Hyperdata
99 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
100 import Gargantext.Database.Prelude
101 import Gargantext.Database.Query.Table.ContextNodeNgrams2
102 import Gargantext.Database.Query.Table.Ngrams
103 import Gargantext.Database.Query.Table.Node
104 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
105 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
106 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
107 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
108 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
109 import Gargantext.Database.Types
110 import Gargantext.Prelude
111 import Gargantext.Prelude.Crypto.Hash (Hash)
112 import qualified Gargantext.Core.Text.Corpus.API as API
113 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
114 import qualified Prelude
116 ------------------------------------------------------------------------
117 -- Imports for upgrade function
118 import Gargantext.Database.Query.Tree.Root (getRootId)
119 import Gargantext.Database.Query.Tree (findNodesId)
120 import qualified Data.List as List
121 ------------------------------------------------------------------------
122 -- TODO use internal with API name (could be old data)
-- | Where a set of documents comes from: either already inside
-- Gargantext's own database ('InternalOrigin') or fetched from an
-- external search API ('ExternalOrigin'). Both constructors record
-- which 'API.ExternalAPIs' backend is involved.
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
  deriving (Generic, Eq)

-- Lenses plus JSON (de)serialisation with the "_do_" prefix stripped.
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin

-- Swagger schema mirrors the JSON encoding (prefix "_do_" removed).
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every available data origin: each configured external API is listed
-- both as an internal and as an external origin.
allDataOrigins :: ( MonadReader env m
                  , HasConfig env) => m [DataOrigin]
-- NOTE(review): the equation line ("allDataOrigins = do") appears to be
-- missing from this extract; the body below reads the configured APIs.
  ext <- API.externalAPIs
  pure $ map InternalOrigin ext
      <> map ExternalOrigin ext
-- | Documents destined for a corpus: either references to nodes already
-- stored in the database ('DataOld'), or freshly fetched documents
-- streamed as a conduit together with an optional total count
-- ('DataNew'). The count is optional because some sources cannot
-- announce their size up front.
data DataText = DataOld ![NodeId]
              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
--- | DataNew ![[HyperdataDocument]]
-- | Debug helper: dump a 'DataText' to stdout. (A 'Show' instance is
-- impossible because 'DataNew' carries an 'IO' conduit.)
-- For 'DataNew' the whole stream is drained into memory before
-- printing, so use this only on small inputs.
printDataText :: DataText -> IO ()
printDataText (DataOld nodeIds) =
  putStrLn (show nodeIds)
printDataText (DataNew (mCount, docsC)) = do
  collected <- C.runConduit (docsC .| CL.consume)
  putStrLn (show (mCount, collected))
-- TODO use the split parameter in config file
-- | Fetch documents for a 'DataOrigin'.
--
--   * 'ExternalOrigin': query the external API; may fail with a
--     'ClientError'.
--   * 'InternalOrigin': stem the query ('stemIt') and search the master
--     user's corpus, returning ids of documents already in the database.
--
-- NOTE(review): the intermediate argument types of this signature are
-- missing from this extract (only the first constraint and the result
-- type are visible).
getDataText :: FlowCmdM env err m
            -> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li = liftBase $ do
  eRes <- API.get api (_tt_lang la) q li
  pure $ DataNew <$> eRes
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           -- NOTE(review): the corpus-name
                                           -- argument line is missing here.
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ Right $ DataOld ids
-- | Debug wrapper around 'getDataText': fetch and print the result (or
-- the 'ClientError') to stdout instead of returning it.
-- NOTE(review): the signature continuation and the "case result of"
-- line are missing from this extract.
getDataText_Debug :: FlowCmdM env err m
getDataText_Debug a l q li = do
  result <- getDataText a l q li
    Left err  -> liftBase $ putStrLn $ show err
    Right res -> liftBase $ printDataText res
-------------------------------------------------------------------------------
-- | Route a 'DataText' into a user's corpus.
--
--   * 'DataOld' ids are attached directly via 'flowCorpusUser' (the
--     documents are already in the database).
--   * 'DataNew' documents are streamed through 'flowCorpus', lifting
--     the IO conduit into the ambient monad with 'transPipe'.
--
-- NOTE(review): most signature lines (constraints and argument types)
-- are missing from this extract.
flowDataText :: forall env err m.
             -> Maybe FlowSocialListWith
flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
    -- NOTE(review): the "where" line introducing this binding is
    -- missing from this extract.
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
  flowCorpus u (Right [cid]) tt mfslw (mLen, (transPipe liftBase txtC)) logStatus
202 ------------------------------------------------------------------------
-- | Flow an "annuaire" (directory of contacts) from a file into the
-- database. The whole file is read into memory as '[HyperdataContact]'
-- before being re-streamed with 'yieldMany'.
-- NOTE(review): the intermediate argument types of this signature are
-- missing from this extract.
flowAnnuaire :: (FlowCmdM env err m)
             -> Either CorpusName [CorpusId]
flowAnnuaire u n l filePath logStatus = do
  -- TODO Conduit for file
  docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus
216 ------------------------------------------------------------------------
-- | Parse a corpus file and flow the parsed documents into a corpus.
-- Parse failures abort with 'panic'.
-- NOTE(review): several signature lines and the "case eParsed of" /
-- "Right parsed ->" lines are missing from this extract; the limit
-- argument is currently unused (bound as _l).
flowCorpusFile :: (FlowCmdM env err m)
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> Maybe FlowSocialListWith
flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ft ff fp
      flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
      --let docs = splitEvery 500 $ take l parsed
      --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
    Left e -> panic $ "Error: " <> T.pack e
237 ------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
-- | Flow documents into a user corpus; specialisation of 'flow' with no
-- explicit corpus hyperdata (defaults to 'HyperdataCorpus').
-- NOTE(review): the intermediate argument types of this signature are
-- missing from this extract.
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           -> Either CorpusName [CorpusId]
           -> Maybe FlowSocialListWith
           -> (Maybe Integer, ConduitT () a m ())
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic entry point: insert a stream of documents into the master
-- corpus (one by one, logging progress through 'logStatus'), then build
-- the user's corpus structure and ngrams lists via 'flowCorpusUser'.
-- NOTE(review): several signature lines (constraints and argument
-- types) are missing from this extract.
flow :: forall env err m a c.
     -> Either CorpusName [CorpusId]
     -> Maybe FlowSocialListWith
     -> (Maybe Integer, ConduitT () a m ())
flow c u cn la mfslw (mLength, docsC) logStatus = do
  -- TODO if public insertMasterDocs else insertUserDocs
  -- Pair every document with its 1-based index so 'insertDoc' can log
  -- progress. NOTE(review): the rest of this conduit pipeline
  -- (presumably piping into insertDoc and collecting the ids) is
  -- missing from this extract.
  ids <- runConduit $ zipSources (yieldMany [1..]) docsC
  -- ids <- traverse (\(idx, doc) -> do
  --                     id <- insertMasterDocs c la doc
  --                     logStatus JobLog { _scst_succeeded = Just $ 1 + idx
  --                                      , _scst_failed = Just 0
  --                                      , _scst_remaining = Just $ length docs - idx
  --                                      , _scst_events = Just []
  --                 ) (zip [1..] docs)
  flowCorpusUser (la ^. tt_lang) u cn c ids mfslw

    -- Insert one indexed document into the master corpus and report
    -- progress. NOTE(review): the "where" line and the binding of "len"
    -- (presumably derived from mLength) are missing from this extract.
    insertDoc :: (Integer, a) -> m NodeId
    insertDoc (idx, doc) = do
      id <- insertMasterDocs c la [doc]
      logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + idx
                       , _scst_failed = Just 0
                       , _scst_remaining = Just $ fromIntegral $ len - idx
                       , _scst_events = Just []
      -- WARNING(review): 'Prelude.head' is partial — an empty result
      -- from insertMasterDocs would crash at runtime. Consider a total
      -- alternative (e.g. failing with a descriptive node error).
      pure $ Prelude.head id
296 ------------------------------------------------------------------------
-- | Build the user-side corpus structure around already-inserted
-- document ids: create (or fetch) the user's root and corpus, attach a
-- NodeTexts child, a term list, the documents themselves, then build
-- ngrams lists (shared with the master corpus), and finally the Graph
-- and Dashboard child nodes plus ngrams occurrence counts.
-- NOTE(review): several signature lines and the final expression of
-- this do-block (presumably returning the corpus id) are missing from
-- this extract.
flowCorpusUser :: ( FlowCmdM env err m
               -> Either CorpusName [CorpusId]
               -> Maybe FlowSocialListWith
flowCorpusUser l user corpusName ctype ids mfslw = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- Master corpus: ngrams lists are built against the master list too.
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
  -- NOTE(review): CoreNLP is currently hard-coded regardless of 'l'.
  let gp = GroupWithPosTag l CoreNLP HashMap.empty
  ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp

  -- printDebug "flowCorpusUser:ngs" ngs

  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- Graph and Dashboard child nodes are created idempotently.
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId
  -- Annuaire flow
  -- _ <- mkAnnuaire  rootUserId userId
  _ <- updateNgramsOccurrences userCorpusId (Just listId)
-- | Insert documents into the master user's corpus (shared, for global
-- monitoring), extract their ngrams, and persist the ngrams against the
-- master list.
-- NOTE(review): the remaining constraint/argument lines of the
-- signature, the "<- mapNodeIdNgrams" binding line for mapNgramsDocs',
-- and the final expression (presumably returning the inserted ids) are
-- missing from this extract.
insertMasterDocs :: ( FlowCmdM env err m
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
    <$> documentIdWithNgrams
          (extractNgramsT $ withLang lang documentsWithId)

  lId <- getOrMkList masterCorpusId masterUserId
  -- _ <- saveDocNgramsWith lId mapNgramsDocs'
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- | Persist extracted ngrams for a list: insert the ngrams terms
-- themselves, register them as node ngrams for the list, and link each
-- ngram to the contexts (documents) it occurs in with its weight.
-- NOTE(review): some signature lines, local "let" lines, and the final
-- expression (presumably "pure ()") are missing from this extract.
saveDocNgramsWith :: (FlowCmdM env err m)
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
saveDocNgramsWith lId mapNgramsDocs' = do
  --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
  -- Drop the TermsCount component: only the weight is persisted below.
  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ HashMap.toList mapNgramsDocs

  --printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  -- Link every (ngram, type, context) triple with its weight; triples
  -- whose cgrams id cannot be resolved are silently dropped (catMaybes).
  _return <- insertContextNodeNgrams2
           $ catMaybes [ ContextNodeNgrams2 <$> Just nId
                                            <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
                                            <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight

  -- NOTE(review): lines between the comprehension above and this call
  -- (closing bracket etc.) are missing from this extract.
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
412 ------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
-- | Insert documents under a parent node, deduplicating via a unique
-- hash ('addUniqId'), and return the new context ids together with the
-- documents indexed by their node id.
-- NOTE(review): the remaining signature lines and the "let" keyword
-- introducing the newIds'/documentsWithId bindings are missing from
-- this extract.
insertDocs :: ( FlowCmdM env err m
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId Nothing docs
  -- printDebug "newIds" newIds
      newIds' = map reId newIds
      -- Join the DB return ids back onto the documents via their hash.
      documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)
433 ------------------------------------------------------------------------
-- | Pair a value with its unique hash.
-- NOTE(review): the remaining signature lines and the "where" keyword
-- are missing from this extract.
-- WARNING(review): panics if the value carries no uniq id — callers
-- must ensure 'addUniqId' ran first.
viewUniqId' :: UniqId a
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
    err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the 'ReturnId's that were actually inserted by their uniq
-- hash, so they can be joined back onto the source documents.
-- NOTE(review): the result-type line and the equation line
-- ("toInserted =") are missing from this extract.
-- NOTE(review): "reInserted r == True" is redundant; "reInserted r"
-- would suffice (left unchanged here).
toInserted :: [ReturnId]
  Map.fromList . map    (\r -> (reUniqId r, r)     )
               . filter (\r -> reInserted r == True)
-- | Join inserted return-ids back onto documents by hash, dropping
-- documents whose hash has no matching 'ReturnId'.
-- NOTE(review): the middle signature line, the "where" keyword, and
-- part of 'toDocumentWithId' are missing from this extract.
-- NOTE(review): "catMaybes . map f" could be "mapMaybe f" (left
-- unchanged here).
mergeData :: Map Hash ReturnId
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
    toDocumentWithId (sha,hpd) =
      Indexed <$> fmap reId (lookup sha rs)
457 ------------------------------------------------------------------------
458 ------------------------------------------------------------------------
459 ------------------------------------------------------------------------
-- | Run an ngrams-extraction function over each indexed document,
-- pairing every document with its extracted ngrams.
-- NOTE(review): the first argument-type line, the "where" keyword, and
-- the line binding "e" (presumably applying 'f' to the document) are
-- missing from this extract.
documentIdWithNgrams :: HasNodeError err
                     -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
    toDocumentIdWithNgrams d = do
      pure $ DocumentIdWithNgrams d e
-- | TODO check optimization
-- | Merge per-document ngrams maps into one map keyed by ngram, with
-- per-type, per-node (weight, count) entries; weights and counts for
-- the same (ngram, type, node) are summed with 'addTuples'.
-- NOTE(review): intermediate signature lines and the "where" keyword
-- are missing from this extract.
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                   (Map NodeId (Int, TermsCount))
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
    -- | NOTE We are somehow multiplying 'TermsCount' here: If the
    -- same ngrams term has different ngrams types, the 'TermsCount'
    -- for it (which is the number of times the terms appears in a
    -- document) is copied over to all its types.
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
    f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
        nId = _index $ documentWithId d
492 ------------------------------------------------------------------------
-- | Contacts contribute only their last name, as an Authors ngram.
-- NOTE(review): the "where" keyword and the equation head of 'extract'
-- are missing from this extract.
instance ExtractNgramsT HyperdataContact
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc

    extract :: TermType Lang -> HyperdataContact
            -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
      -- Fall back to the literal string "Nothing" when no last name is set.
      let authors = map text2ngrams
                  $ maybe ["Nothing"] (\a -> [a])
                  $ view (hc_who . _Just . cw_lastName) hc'

      pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
-- | Documents contribute their source, institutes, authors (as simple
-- ngrams) and their extracted terms (as POS-enriched ngrams with
-- occurrence counts).
-- NOTE(review): the "where" keyword and the field-accessor lines
-- (presumably "$ _hd_source doc" etc.) are missing from this extract.
instance ExtractNgramsT HyperdataDocument
    extractNgramsT :: TermType Lang
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd

    extractNgramsT' :: TermType Lang
                    -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
    extractNgramsT' lang' doc = do
      -- Missing metadata falls back to the literal string "Nothing".
      let source = text2ngrams
                 $ maybe "Nothing" identity

          institutes = map text2ngrams
                     $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))

          authors = map text2ngrams
                  $ maybe ["Nothing"] (T.splitOn ", ")

      -- Full-text terms, enriched with POS tags (NP) via CoreNLP.
      termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
                          <$> liftBase (extractTerms lang' $ hasText doc)

      pure $ HashMap.fromList
           $  [(SimpleNgrams source, (Map.singleton Sources 1, 1))                    ]
           <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1))  | i' <- institutes ]
           <> [(SimpleNgrams a', (Map.singleton Authors 1, 1))     | a' <- authors    ]
           <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
-- | A Node delegates ngrams extraction to its hyperdata payload.
-- NOTE(review): the "where" keyword is missing from this extract.
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
    extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
-- | A Node delegates text access to its hyperdata payload.
-- NOTE(review): the "where" keyword is missing from this extract.
instance HasText a => HasText (Node a)
    hasText (Node { _node_hyperdata = h }) = hasText h
-- | TODO putelsewhere
-- | Upgrade function
-- Suppose all documents are English (this is the case actually)
-- | Re-extract and re-insert ngrams for every document of every corpus
-- under the master user's root, in batches of 1000.
-- NOTE(review): the result-type line of the signature and the final
-- expression (presumably "pure ()") are missing from this extract.
indexAllDocumentsWithPosTag :: FlowCmdM env err m
indexAllDocumentsWithPosTag = do
  rootId <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  -- Batch to bound memory use per extraction pass.
  _ <- mapM extractInsert (splitEvery 1000 docs)
562 extractInsert :: FlowCmdM env err m
563 => [Node HyperdataDocument] -> m ()
564 extractInsert docs = do
565 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
566 mapNgramsDocs' <- mapNodeIdNgrams
567 <$> documentIdWithNgrams
568 (extractNgramsT $ withLang (Multi EN) documentsWithId)
570 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'