2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
41 , getOrMk_RootWithCorpus
47 , indexAllDocumentsWithPosTag
52 import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
53 import Control.Monad.Reader (MonadReader)
54 import Data.Aeson.TH (deriveJSON)
55 import Data.Conduit.Internal (zipSources)
56 import qualified Data.Conduit.List as CList
58 import Data.HashMap.Strict (HashMap)
59 import Data.Hashable (Hashable)
60 import Data.List (concat)
61 import Data.Map.Strict (Map, lookup)
62 import Data.Maybe (catMaybes)
65 import qualified Data.Text as T
66 import Data.Tuple.Extra (first, second)
67 import GHC.Generics (Generic)
68 import Servant.Client (ClientError)
69 import System.FilePath (FilePath)
70 import qualified Data.HashMap.Strict as HashMap
71 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
72 import qualified Data.Map.Strict as Map
73 import qualified Data.Conduit.List as CL
74 import qualified Data.Conduit as C
76 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
77 import Gargantext.Core (Lang(..), PosTagAlgo(..))
78 -- import Gargantext.Core.Ext.IMT (toSchoolName)
79 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
80 import Gargantext.Core.Flow.Types
81 import Gargantext.Core.NLP (nlpServerGet)
82 import Gargantext.Core.Text
83 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
84 import Gargantext.Core.Text.List (buildNgramsLists)
85 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
86 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
87 import Gargantext.Core.Text.Terms
88 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
89 import Gargantext.Core.Types (POS(NP), TermsCount)
90 import Gargantext.Core.Types.Individu (User(..))
91 import Gargantext.Core.Types.Main
92 import Gargantext.Core.Utils (addTuples)
93 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
94 import Gargantext.Database.Action.Flow.List
95 import Gargantext.Database.Action.Flow.Types
96 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
97 import Gargantext.Database.Action.Search (searchDocInDatabase)
98 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
99 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
100 import Gargantext.Database.Admin.Types.Hyperdata
101 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
102 import Gargantext.Database.Prelude
103 import Gargantext.Database.Query.Table.ContextNodeNgrams2
104 import Gargantext.Database.Query.Table.Ngrams
105 import Gargantext.Database.Query.Table.Node
106 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
107 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
108 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
109 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
110 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
111 import Gargantext.Database.Types
112 import Gargantext.Prelude
113 import Gargantext.Prelude.Crypto.Hash (Hash)
114 import qualified Gargantext.Core.Text.Corpus.API as API
115 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
116 --import qualified Prelude
118 ------------------------------------------------------------------------
119 -- Imports for upgrade function
120 import Gargantext.Database.Query.Tree.Root (getRootId)
121 import Gargantext.Database.Query.Tree (findNodesId)
122 import qualified Data.List as List
123 ------------------------------------------------------------------------
124 -- TODO use internal with API name (could be old data)
-- | Provenance of a batch of documents: 'InternalOrigin' means the
--   documents already live in the local database, 'ExternalOrigin'
--   means they are fetched from an external API.  Both carry the API
--   tag (per the TODO above: internal data may be old data originally
--   imported from that API).
-- NOTE(review): this extract skips some interior lines of the
-- declaration — confirm against the full source.
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
  deriving (Generic, Eq)

-- Lenses and JSON (de)serialisation, stripping the "_do_" field prefix.
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin

-- Swagger schema kept consistent with the JSON encoding above.
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every known 'DataOrigin': each configured external API
--   ('API.externalAPIs'), tagged once as internal and once as
--   external origin.
-- NOTE(review): the @allDataOrigins = do@ equation line is missing
-- from this extract — confirm against the full source.
allDataOrigins :: ( MonadReader env m
                  , HasConfig env) => m [DataOrigin]
  ext <- API.externalAPIs
  pure $ map InternalOrigin ext
      <> map ExternalOrigin ext
-- | Documents to be flowed into a corpus: either references to
--   documents already stored in the database ('DataOld'), or freshly
--   fetched documents streamed as a conduit together with an optional
--   total count ('DataNew').
data DataText = DataOld ![NodeId]
              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
              --- | DataNew ![[HyperdataDocument]]

-- Show instance is not possible because of IO
-- | Debug helper: dump a 'DataText' to stdout.  A 'Show' instance is
--   not possible because 'DataNew' carries an 'IO' conduit, so the
--   conduit is drained here and the collected documents are printed.
printDataText :: DataText -> IO ()
printDataText (DataOld ids) = print ids
printDataText (DataNew (mLen, docsC)) = do
  docs <- C.runConduit $ docsC .| CL.consume
  print (mLen, docs)
-- TODO use the split parameter in config file
-- | Fetch the documents described by a 'DataOrigin'.
--
-- * 'ExternalOrigin': query the external API ('API.get') and wrap the
--   resulting stream in 'DataNew' (or propagate the 'ClientError').
-- * 'InternalOrigin': search the master corpus in the database for the
--   stemmed query ('stemIt') and return the matching ids as 'DataOld'.
--
-- NOTE(review): several argument lines of the type signature are
-- missing from this extract — confirm against the full source.
getDataText :: FlowCmdM env err m
            -> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li = liftBase $ do
  eRes <- API.get api (_tt_lang la) q li
  pure $ DataNew <$> eRes
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           -- NOTE(review): a line (likely the corpus-name argument) is missing here
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ Right $ DataOld ids
-- | Debug wrapper around 'getDataText': prints either the client
--   error or the resulting 'DataText' (via 'printDataText') to stdout.
-- NOTE(review): the remainder of the signature and the
-- @case result of@ scrutinee line are missing from this extract.
getDataText_Debug :: FlowCmdM env err m
getDataText_Debug a l q li = do
  result <- getDataText a l q li
    Left err -> liftBase $ putStrLn $ show err
    Right res -> liftBase $ printDataText res
-------------------------------------------------------------------------------
-- | Flow a 'DataText' into the corpus identified by @cid@.
--
-- * 'DataOld': the documents already exist — link them to the user
--   corpus ('Doc.add') and rebuild the lists ('flowCorpusUser').
-- * 'DataNew': stream the new documents through 'flowCorpus'.
--
-- NOTE(review): several lines of the type signature are missing from
-- this extract — confirm against the full source.
flowDataText :: forall env err m.
             -> Maybe FlowSocialListWith
flowDataText u (DataOld ids) tt cid mfslw _ = do
  (_userId, userCorpusId, listId) <- createNodes u (Right [cid]) corpusType
  _ <- Doc.add userCorpusId ids
  flowCorpusUser (_tt_lang tt) u userCorpusId listId corpusType mfslw
    -- NOTE(review): the @where@ keyword line is missing from this extract
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
  flowCorpus u (Right [cid]) tt mfslw (mLen, (transPipe liftBase txtC)) logStatus
------------------------------------------------------------------------
-- | Flow an annuaire (contacts directory) file into the database as
--   'HyperdataContact' documents.  The whole file is read into memory
--   ('readFile_Annuaire') before flowing — hence the TODO below.
-- NOTE(review): some signature lines are missing from this extract.
flowAnnuaire :: (FlowCmdM env err m)
             -> Either CorpusName [CorpusId]
flowAnnuaire u n l filePath logStatus = do
  -- TODO Conduit for file
  docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus
------------------------------------------------------------------------
-- | Parse a local corpus file ('parseFile' with the given type/format)
--   and flow the parsed documents into a corpus via 'flowCorpus'.
--   Panics on a parse error.
-- NOTE(review): parts of the signature and the @case eParsed of@
-- scrutinee line are missing from this extract.
flowCorpusFile :: (FlowCmdM env err m)
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> Maybe FlowSocialListWith
flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ft ff fp
      flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
      --let docs = splitEvery 500 $ take l parsed
      --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
    Left e -> panic $ "Error: " <> T.pack e
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
-- | Flow a stream of documents into a user corpus using the default
--   'HyperdataCorpus' corpus type.  Thin wrapper around 'flow'.
-- NOTE(review): some signature lines are missing from this extract.
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           -> Either CorpusName [CorpusId]
           -> Maybe FlowSocialListWith
           -> (Maybe Integer, ConduitT () a m ())
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Core flow: create the corpus node hierarchy ('createNodes'),
--   stream the documents in chunks of 100 into the master corpus
--   ('insertMasterDocs' via the local @insertDocs'@ helper, reporting
--   progress through @logStatus@), link each chunk to the user corpus
--   ('Doc.add'), then build the ngrams lists ('flowCorpusUser').
-- NOTE(review): many interior lines (signature arguments, the tail of
-- the conduit pipeline, the @where@ keyword and the end of the
-- 'JobLog' record) are missing from this extract — confirm against
-- the full source before relying on the details below.
flow :: forall env err m a c.
     -> Either CorpusName [CorpusId]
     -> Maybe FlowSocialListWith
     -> (Maybe Integer, ConduitT () a m ())
flow c u cn la mfslw (mLength, docsC) logStatus = do
  (_userId, userCorpusId, listId) <- createNodes u cn c
  -- TODO if public insertMasterDocs else insertUserDocs
  _ <- runConduit $ zipSources (yieldMany [1..]) docsC
                 .| CList.chunksOf 100
                 .| mapM_C (\ids' -> do
                       _ <- Doc.add userCorpusId ids'
  _ <- flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw

  -- ids <- traverse (\(idx, doc) -> do
  --                     id <- insertMasterDocs c la doc
  --                     logStatus JobLog { _scst_succeeded = Just $ 1 + idx
  --                                      , _scst_failed   = Just 0
  --                                      , _scst_remaining = Just $ length docs - idx
  --                                      , _scst_events   = Just []
  --                 ) (zip [1..] docs)
  --printDebug "[flow] calling flowCorpusUser" (0 :: Int)
  --flowCorpusUser (la ^. tt_lang) u cn c ids mfslw

    -- Insert one chunk of (index, doc) pairs into the master corpus
    -- and report progress; returns the new node ids.
    insertDocs' :: [(Integer, a)] -> m [NodeId]
    insertDocs' [] = pure []
    insertDocs' docs = do
      -- printDebug "[flow] calling insertDoc, ([idx], mLength) = " (fst <$> docs, mLength)
      ids <- insertMasterDocs c la (snd <$> docs)
      let maxIdx = maximum (fst <$> docs)
      logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + maxIdx
                       , _scst_failed    = Just 0
                       , _scst_remaining = Just $ fromIntegral $ len - maxIdx
                       , _scst_events    = Just []
------------------------------------------------------------------------
-- | Create (or fetch) the node hierarchy for a corpus: the user root
--   and corpus node ('getOrMk_RootWithCorpus'), then the default
--   'NodeTexts', list, 'NodeGraph' and 'NodeDashboard' children.
--   Returns the user id, corpus id and list id.
-- NOTE(review): some signature lines are missing from this extract.
createNodes :: ( FlowCmdM env err m
            -> Either CorpusName [CorpusId]
            -> m (UserId, CorpusId, ListId)
createNodes user corpusName ctype = do
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId
  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- Graph and dashboard children (created only if absent).
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
  pure (userId, userCorpusId, listId)
-- | Post-insertion step for a user corpus: resolve the NLP server for
--   the language, fetch the master corpus, then (unless the social
--   list option says @NoList@) build the ngrams lists
--   ('buildNgramsLists' with 'GroupWithPosTag'), persist them
--   ('flowList_DbRepo'), ensure the master list exists, and refresh
--   the ngrams occurrence counts ('updateNgramsOccurrences').
-- NOTE(review): several lines (signature arguments, the @case mfslw of@
-- scrutinee and the function's final expression) are missing from
-- this extract — confirm against the full source.
flowCorpusUser :: ( FlowCmdM env err m
               -> Maybe FlowSocialListWith
flowCorpusUser l user userCorpusId listId ctype mfslw = do
  server <- view (nlpServerGet l)
  -- User Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
    (Just (NoList _)) -> do
      -- printDebug "Do not build list" mfslw
      ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw
               $ GroupWithPosTag l server HashMap.empty
      -- printDebug "flowCorpusUser:ngs" ngs
      _userListId <- flowList_DbRepo listId ngs
      _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  --_ <- mkPhylo  userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire  rootUserId userId
  _ <- updateNgramsOccurrences userCorpusId (Just listId)
-- | Insert documents into the master corpus: create/fetch the master
--   root+corpus, insert and link the documents ('insertDocs',
--   'Doc.add'), extract their ngrams ('extractNgramsT' via
--   'documentIdWithNgrams' / 'mapNodeIdNgrams'), and save them on the
--   master list ('saveDocNgramsWith').
-- NOTE(review): signature arguments, part of the @mapNgramsDocs'@
-- binding and the final expression are missing from this extract.
insertMasterDocs :: ( FlowCmdM env err m
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                 <$> documentIdWithNgrams
                       (extractNgramsT $ withLang lang documentsWithId)

  lId <- getOrMkList masterCorpusId masterUserId
  -- _ <- saveDocNgramsWith lId mapNgramsDocs'
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- | Persist extracted ngrams for a list: insert the ngrams terms
--   themselves ('insertExtractedNgrams'), register them on the list
--   ('listInsertDb'), then insert the per-context weights
--   ('insertContextNodeNgrams2') and the doc/ngrams associations
--   ('insertDocNgrams').  The 'TermsCount' component is dropped via
--   'over (traverse.traverse.traverse) fst' where only weights matter.
-- NOTE(review): some interior lines (parts of the signature and a few
-- bindings) are missing from this extract.
saveDocNgramsWith :: (FlowCmdM env err m)
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
saveDocNgramsWith lId mapNgramsDocs' = do
  --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- Map each (ngrams term, ngrams types) pair to its list-level id.
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
                   $ map (first _ngramsTerms . second Map.keys)
                   $ HashMap.toList mapNgramsDocs

  --printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  let ngrams2insert = catMaybes [ ContextNodeNgrams2 <$> Just nId
                                                     <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
                                                     <*> Just (fromIntegral w :: Double)
                                | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                                , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                                , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight
  -- printDebug "Ngrams2Insert" ngrams2insert
  _return <- insertContextNodeNgrams2 ngrams2insert

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
-- | Insert documents for a user into a corpus: add unique ids
--   ('addUniqId'), insert into the DB ('insertDb'), link the new
--   contexts to the corpus ('Doc.add'), and return the new ids
--   paired with the indexed documents ('mergeData').
-- NOTE(review): signature argument lines and the @let@/@where@
-- keyword introducing the local bindings are missing here.
insertDocs :: ( FlowCmdM env err m
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId Nothing docs
  -- printDebug "newIds" newIds
      newIds' = map reId newIds
      documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)
------------------------------------------------------------------------
-- | Pair a document with its unique hash (via the 'uniqId' lens);
--   panics if the document has no unique id.
-- NOTE(review): signature continuation and @where@ lines are missing.
viewUniqId' :: UniqId a
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
    -- Reached only when 'addUniqId' was not applied beforehand.
    err = panic "[ERROR] Database.Flow.toInsert"
-- | Keep only the 'ReturnId's that were actually inserted
--   ('reInserted') and index them by their unique id.
-- NOTE(review): the rest of the signature and the equation's
-- left-hand side are missing from this extract.
toInserted :: [ReturnId]
  Map.fromList . map (\r -> (reUniqId r, r) )
    . filter (\r -> reInserted r == True)
-- | Join inserted 'ReturnId's with the hash-indexed documents,
--   producing 'Indexed' documents; hashes absent from the returned-id
--   map are silently dropped ('catMaybes').
-- NOTE(review): signature and @where@ lines are missing here.
mergeData :: Map Hash ReturnId
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
    toDocumentWithId (sha,hpd) =
      Indexed <$> fmap reId (lookup sha rs)
490 ------------------------------------------------------------------------
491 ------------------------------------------------------------------------
492 ------------------------------------------------------------------------
-- | Run an ngrams-extraction function over each indexed document and
--   pair the document with its extracted ngrams
--   ('DocumentIdWithNgrams').
-- NOTE(review): part of the extraction function's type and the body of
-- the local @toDocumentIdWithNgrams@ are missing from this extract.
documentIdWithNgrams :: HasNodeError err
                     -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
    toDocumentIdWithNgrams d = do
      pure $ DocumentIdWithNgrams d e
-- | TODO check optimization
-- | Merge per-document ngrams maps into a single map keyed by ngrams,
--   unioning the inner maps and summing weight/count pairs with
--   'addTuples'.
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                   (Map NodeId (Int, TermsCount))
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
    -- | NOTE We are somehow multiplying 'TermsCount' here: If the
    -- same ngrams term has different ngrams types, the 'TermsCount'
    -- for it (which is the number of times the terms appears in a
    -- document) is copied over to all its types.
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
    f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
        nId = _index $ documentWithId d
------------------------------------------------------------------------
-- | Contacts only contribute their last name as an "Authors" ngram;
--   extracted ngrams are truncated to 255 characters.
-- NOTE(review): the @where@ keyword and some interior lines are
-- missing from this extract.
instance ExtractNgramsT HyperdataContact
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      -- Extract the last name (or the literal "Nothing" when absent)
      -- as a single Authors ngram with weight and count 1.
      extract :: TermType Lang -> HyperdataContact
              -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
        let authors = map text2ngrams
                    $ maybe ["Nothing"] (\a -> [a])
                    $ view (hc_who . _Just . cw_lastName) hc'

        pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
-- | Documents contribute their source, institutes, authors and
--   NLP-extracted terms; extracted ngrams are truncated to 255 chars.
-- NOTE(review): the @where@ keyword and several interior lines
-- (field accessors for source/institutes/authors) are missing from
-- this extract.
instance ExtractNgramsT HyperdataDocument
    extractNgramsT :: TermType Lang
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      extractNgramsT' :: TermType Lang
                      -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
      extractNgramsT' lang' doc = do
        -- Source is a single ngram; absent fields default to "Nothing".
        let source = text2ngrams
                   $ maybe "Nothing" identity
            -- Institutes are split on the document's DB-specific separator.
            institutes = map text2ngrams
                       $ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
            -- Authors likewise split on the DB-specific separator.
            authors = map text2ngrams
                    $ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
        ncs <- view (nlpServerGet $ lang' ^. tt_lang)
        -- Terms come from the NLP server, POS-tagged with CoreNLP (NP).
        termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
                              <$> liftBase (extractTerms ncs lang' $ hasText doc)

        pure $ HashMap.fromList
             $  [(SimpleNgrams source, (Map.singleton Sources 1, 1))                    ]
             <> [(SimpleNgrams     i', (Map.singleton Institutes  1, 1)) | i' <- institutes ]
             <> [(SimpleNgrams     a', (Map.singleton Authors     1, 1)) | a' <- authors    ]
             <> [(EnrichedNgrams   t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
-- | Delegate ngrams extraction to the node's hyperdata payload.
-- NOTE(review): the @where@ keyword line is missing from this extract.
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
    extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
-- | Delegate text extraction to the node's hyperdata payload.
-- NOTE(review): the @where@ keyword line is missing from this extract.
instance HasText a => HasText (Node a)
    hasText (Node { _node_hyperdata = h }) = hasText h
-- | TODO putelsewhere
-- | Upgrade function
-- Suppose all documents are English (this is the case actually)
-- | Re-extract and re-insert ngrams for every document under every
--   corpus of the master user, processing documents in chunks of 1000
--   ('extractInsert' on each chunk).
-- NOTE(review): the tail of the signature and the final expression are
-- missing from this extract.
indexAllDocumentsWithPosTag :: FlowCmdM env err m
indexAllDocumentsWithPosTag = do
  rootId <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  _ <- mapM extractInsert (splitEvery 1000 docs)
597 extractInsert :: FlowCmdM env err m
598 => [Node HyperdataDocument] -> m ()
599 extractInsert docs = do
600 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
601 mapNgramsDocs' <- mapNodeIdNgrams
602 <$> documentIdWithNgrams
603 (extractNgramsT $ withLang (Multi EN) documentsWithId)
605 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'