2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
39 , getOrMk_RootWithCorpus
45 , indexAllDocumentsWithPosTag
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
59 import Data.Text (splitOn)
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
68 import Gargantext.Core (Lang(..), PosTagAlgo(..))
69 import Gargantext.Core.Ext.IMT (toSchoolName)
70 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
71 import Gargantext.Core.Flow.Types
72 import Gargantext.Core.Text
73 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Core.Text.List (buildNgramsLists)
76 import Gargantext.Core.Text.Terms
77 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
78 import Gargantext.Core.Types (POS(NP))
79 import Gargantext.Core.Types.Individu (User(..))
80 import Gargantext.Core.Types.Main
81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
82 import Gargantext.Database.Action.Flow.List
83 import Gargantext.Database.Action.Flow.Types
84 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
85 import Gargantext.Database.Action.Search (searchDocInDatabase)
86 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
87 import Gargantext.Database.Admin.Types.Hyperdata
88 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
89 import Gargantext.Database.Prelude
90 import Gargantext.Database.Query.Table.Ngrams
91 import Gargantext.Database.Query.Table.Node
92 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
93 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
94 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
95 import Gargantext.Database.Query.Table.NodeNodeNgrams2
96 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
97 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
98 import Gargantext.Database.Types
99 import Gargantext.Prelude
100 import Gargantext.Prelude.Crypto.Hash (Hash)
101 import qualified Gargantext.Core.Text.Corpus.API as API
102 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
104 ------------------------------------------------------------------------
105 -- Imports for upgrade function
106 import Gargantext.Database.Query.Tree.Root (getRootId)
107 import Gargantext.Database.Query.Tree (findNodesId)
108 import qualified Data.List as List
109 ------------------------------------------------------------------------
110 -- TODO use internal with API name (could be old data)
-- | Where corpus text comes from: either data already hosted by this
--   instance ('InternalOrigin') or an external provider ('ExternalOrigin').
--   Both constructors carry the external-API tag ('_do_api').
-- NOTE(review): this extract is missing at least one line between the
-- second constructor and the deriving clause — verify against the repo.
111 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
112 | ExternalOrigin { _do_api :: API.ExternalAPIs }
114   deriving (Generic, Eq)
-- Lenses and JSON instances are generated with the "_do_" prefix stripped.
116 makeLenses ''DataOrigin
117 deriveJSON (unPrefix "_do_") ''DataOrigin
-- Swagger schema mirrors the JSON encoding (same "_do_" prefix handling).
118 instance ToSchema DataOrigin where
119   declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every known 'DataOrigin': one 'InternalOrigin' and one
--   'ExternalOrigin' per available external API.  All internal origins
--   come first, matching the order of 'API.externalAPIs'.
allDataOrigins :: [DataOrigin]
allDataOrigins = [ origin api
                 | origin <- [InternalOrigin, ExternalOrigin]
                 , api    <- API.externalAPIs
                 ]
-- | Text payload flowing into a corpus: either ids of documents already
--   stored in the database ('DataOld') or batches of freshly fetched
--   documents ('DataNew').  Fields are strict.
126 data DataText = DataOld ![NodeId]
127 | DataNew ![[HyperdataDocument]]
129 -- TODO use the split parameter in config file
-- | Fetch text for a query from the given 'DataOrigin'.
--   * 'ExternalOrigin': delegate to 'API.get' and wrap the result in
--     'DataNew'.
--   * 'InternalOrigin': look up the master corpus and full-text search
--     the stemmed query via 'searchDocInDatabase' (presumably returning
--     'DataOld' with the found ids — the final lines are missing from
--     this extract, verify against the repo).
-- NOTE(review): several signature and body lines are absent here.
130 getDataText :: FlowCmdM env err m
136 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
138 <$> API.get api (_tt_lang la) q li
140 getDataText (InternalOrigin _) _la q _li = do
141 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
142 (UserName userMaster)
144 (Nothing :: Maybe HyperdataCorpus)
145 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
148 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into the corpus @cid@ for user @u@:
--   'DataOld' ids go straight to 'flowCorpusUser'; 'DataNew' batches go
--   through the full 'flowCorpus' pipeline (insertion + ngrams).
-- NOTE(review): signature lines are missing from this extract;
-- 'corpusType' below looks like a where-binding of the first equation.
149 flowDataText :: ( FlowCmdM env err m
156 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
158 corpusType = (Nothing :: Maybe HyperdataCorpus)
159 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
161 ------------------------------------------------------------------------
-- | Flow an annuaire (directory of contacts) read from @filePath@:
--   parse with 'readFile_Annuaire', batch contacts 500 at a time, then
--   run the generic 'flow' with 'HyperdataAnnuaire' as corpus type.
-- NOTE(review): parts of the signature are missing from this extract.
163 flowAnnuaire :: (FlowCmdM env err m)
165 -> Either CorpusName [CorpusId]
169 flowAnnuaire u n l filePath = do
170 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
171 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
173 ------------------------------------------------------------------------
-- | Flow a corpus parsed from a local file: parse @fp@ in format @ff@,
--   batch documents 500 at a time, convert with 'toHyperdataDocument',
--   and hand off to 'flowCorpus'.
-- NOTE(review): the lines applying the 'Limit' and calling 'parseFile'
-- are missing from this extract — verify against the repo.
174 flowCorpusFile :: (FlowCmdM env err m)
176 -> Either CorpusName [CorpusId]
177 -> Limit -- Limit the number of docs (for dev purpose)
178 -> TermType Lang -> FileFormat -> FilePath
180 flowCorpusFile u n l la ff fp = do
181 docs <- liftBase ( splitEvery 500
185 flowCorpus u n la (map (map toHyperdataDocument) docs)
187 ------------------------------------------------------------------------
188 -- | TODO improve the needed type to create/update a corpus
189 -- (For now, Either is enough)
-- 'flowCorpus' is 'flow' specialised to a plain corpus (no custom
-- 'HyperdataCorpus' supplied).
190 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
192 -> Either CorpusName [CorpusId]
196 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic flow: insert every batch of documents into the master
--   corpus ('insertMasterDocs'), then attach the concatenated ids to
--   the user's corpus via 'flowCorpusUser'.
-- NOTE(review): signature lines are missing from this extract.
199 flow :: ( FlowCmdM env err m
205 -> Either CorpusName [CorpusId]
209 flow c u cn la docs = do
210 -- TODO if public insertMasterDocs else insertUserDocs
211 ids <- traverse (insertMasterDocs c la) docs
212 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
214 ------------------------------------------------------------------------
-- | Set up (or reuse) the user's corpus and its satellite nodes, link
--   the given document ids to it, build the ngrams lists, and create
--   dashboard/graph nodes.  Side-effect ordering matters: NodeTexts is
--   created before the list node (see comments below).
-- NOTE(review): signature lines and some body lines are missing from
-- this extract — do not assume the visible statements are contiguous.
215 flowCorpusUser :: ( FlowCmdM env err m
220 -> Either CorpusName [CorpusId]
224 flowCorpusUser l user corpusName ctype ids = do
-- User-side corpus (root is created on demand).
226 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
227 -- NodeTexts is first
228 _tId <- insertDefaultNode NodeTexts userCorpusId userId
229 -- printDebug "NodeTexts: " tId
231 -- NodeList is second
232 listId <- getOrMkList userCorpusId userId
233 -- _cooc <- insertDefaultNode NodeListCooc listId userId
234 -- TODO: check if present already, ignore
-- Link the inserted documents to the user corpus.
235 _ <- Doc.add userCorpusId ids
237 -- printDebug "Node Text Ids:" tId
-- Master-side corpus (shared, owned by 'userMaster').
240 (masterUserId, _masterRootId, masterCorpusId)
241 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
243 --let gp = (GroupParams l 2 3 (StopSize 3))
-- Grouping strategy: POS-tag based (CoreNLP), no pre-seeded groups.
244 let gp = GroupWithPosTag l CoreNLP HashMap.empty
245 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
-- Persist the user list; ensure the master list exists too.
247 _userListId <- flowList_DbRepo listId ngs
248 _mastListId <- getOrMkList masterCorpusId masterUserId
249 -- _ <- insertOccsUpdates userCorpusId mastListId
250 -- printDebug "userListId" userListId
252 _ <- insertDefaultNode NodeDashboard userCorpusId userId
253 _ <- insertDefaultNode NodeGraph userCorpusId userId
254 --_ <- mkPhylo userCorpusId userId
256 -- _ <- mkAnnuaire rootUserId userId
-- | Insert documents into the master corpus: create/reuse the master
--   root+corpus, insert the docs ('insertDocs'), link them with
--   'Doc.add', extract their ngrams, and persist the ngrams with
--   'saveDocNgramsWith' against the master list.
-- NOTE(review): signature lines and the binding that produces
-- 'mapNgramsDocs'' (presumably via 'mapNodeIdNgrams') are missing from
-- this extract — verify against the repo.
260 insertMasterDocs :: ( FlowCmdM env err m
268 insertMasterDocs c lang hs = do
269 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
270 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
271 _ <- Doc.add masterCorpusId ids'
273 -- create a corpus with database name (CSV or PubMed)
274 -- add documents to the corpus (create node_node link)
275 -- this will enable global database monitoring
277 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
278 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
280 <$> documentIdWithNgrams
281 (extractNgramsT $ withLang lang documentsWithId)
284 lId <- getOrMkList masterCorpusId masterUserId
285 _ <- saveDocNgramsWith lId mapNgramsDocs'
287 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- | Persist extracted ngrams for a list:
--   1. insert the raw extracted ngrams and get their term ids;
--   2. re-key the map by plain ngrams, then by indexed ngrams;
--   3. insert NodeNgrams rows ('listInsertDb') and the per-document
--      NodeNodeNgrams2 weights;
--   4. finally record doc/ngrams associations ('insertDocNgrams').
-- NOTE(review): this extract has gaps (e.g. the closing of the list
-- comprehension and the final return are not visible).
290 saveDocNgramsWith :: ( FlowCmdM env err m)
292 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
294 saveDocNgramsWith lId mapNgramsDocs' = do
295 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
296 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
299 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
302 mapCgramsId <- listInsertDb lId toNodeNgramsW'
303 $ map (first _ngramsTerms . second Map.keys)
304 $ HashMap.toList mapNgramsDocs
-- One NodeNodeNgrams2 row per (term, ngrams-type, node) with its weight.
307 _return <- insertNodeNodeNgrams2
308 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
309 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
310 <*> Just (fromIntegral w :: Double)
311 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
312 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
313 , (nId, w) <- Map.toList mapNodeIdWeight
316 _ <- insertDocNgrams lId indexedNgrams
321 ------------------------------------------------------------------------
322 -- TODO Type NodeDocumentUnicised
-- | Insert documents under @cId@ for user @uId@: tag each doc with a
--   uniqueness hash ('addUniqId'), insert via 'insertDb', pair the
--   returned ids back with their documents ('mergeData' on the hash),
--   and link the new ids to the corpus with 'Doc.add'.
--   Returns the new ids and the id-indexed documents.
-- NOTE(review): the 'let' introducing the bindings below is on a line
-- missing from this extract.
323 insertDocs :: ( FlowCmdM env err m
330 -> m ([DocId], [Indexed NodeId a])
331 insertDocs uId cId hs = do
332 let docs = map addUniqId hs
333 newIds <- insertDb uId cId docs
334 -- printDebug "newIds" newIds
336 newIds' = map reId newIds
337 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
338 _ <- Doc.add cId newIds'
339 pure (newIds', documentsWithId)
342 ------------------------------------------------------------------------
-- | Extract a document's uniqueness hash, pairing it with the document.
--   Panics if the document has no 'uniqId' — callers must have run
--   'addUniqId' first (as 'insertDocs' does).
343 viewUniqId' :: UniqId a
346 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
348 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the rows that were actually inserted by their uniqueness
--   hash ('reUniqId'), dropping rows the database reported as not
--   inserted.
-- NOTE(review): 'reInserted r == True' is redundant — 'filter reInserted'
-- would do; left unchanged because signature lines are missing here.
351 toInserted :: [ReturnId]
354 Map.fromList . map (\r -> (reUniqId r, r) )
355 . filter (\r -> reInserted r == True)
-- | Join inserted 'ReturnId's with the hash-keyed documents: for each
--   (hash, doc) pair, look the hash up in @rs@ and index the document
--   by the returned node id; pairs with no match are dropped.
-- NOTE(review): 'catMaybes . map f' is 'mapMaybe f'; left unchanged
-- because this extract is missing intervening lines.
357 mergeData :: Map Hash ReturnId
359 -> [Indexed NodeId a]
360 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
362 toDocumentWithId (sha,hpd) =
363 Indexed <$> fmap reId (lookup sha rs)
366 ------------------------------------------------------------------------
367 ------------------------------------------------------------------------
368 ------------------------------------------------------------------------
-- | Run the ngrams-extraction function @f@ over each id-indexed
--   document, pairing every document with its extracted ngrams in a
--   'DocumentIdWithNgrams'.
-- NOTE(review): the line binding the extraction result (presumably
-- @e <- f ...@) is missing from this extract.
369 documentIdWithNgrams :: HasNodeError err
371 -> Cmd err (HashMap b (Map NgramsType Int)))
372 -> [Indexed NodeId a]
373 -> Cmd err [DocumentIdWithNgrams a b]
374 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
376 toDocumentIdWithNgrams d = do
378 pure $ DocumentIdWithNgrams d e
381 -- | TODO check optimization
-- Merge per-document ngrams into one nested map
-- (ngrams -> ngrams type -> node id -> count), summing counts when the
-- same (ngrams, type, node) triple occurs in several documents.
382 mapNodeIdNgrams :: (Ord b, Hashable b)
383 => [DocumentIdWithNgrams a b]
388 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
-- 'f' lifts one document's counts into singleton node-id maps so the
-- outer unions can combine across documents.
390 f :: DocumentIdWithNgrams a b
391 -> HashMap b (Map NgramsType (Map NodeId Int))
392 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
394 nId = _index $ documentWithId d
397 ------------------------------------------------------------------------
-- | Ngrams extraction for contacts: only the last name is used, tagged
--   as an 'Authors' ngram with count 1.  Extracted terms are truncated
--   to 255 characters ('cleanExtractedNgrams 255').
398 instance ExtractNgramsT HyperdataContact
400 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
402 extract :: TermType Lang -> HyperdataContact
403 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-- Falls back to the literal string "Nothing" when no last name is set.
405 let authors = map text2ngrams
406 $ maybe ["Nothing"] (\a -> [a])
407 $ view (hc_who . _Just . cw_lastName) hc'
409 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
-- | Ngrams extraction for documents: source, institutes, authors and
--   POS-tagged terms, each mapped to its 'NgramsType' with count 1.
--   Keys are truncated to 255 characters ('cleanExtractedNgrams 255').
-- NOTE(review): the field accessors feeding 'maybe' (source/institutes/
-- authors fields) are on lines missing from this extract.
412 instance ExtractNgramsT HyperdataDocument
414 extractNgramsT :: TermType Lang
416 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
417 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
419 extractNgramsT' :: TermType Lang
421 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
422 extractNgramsT' lang' doc = do
-- Missing metadata degrades to the literal string "Nothing".
423 let source = text2ngrams
424 $ maybe "Nothing" identity
-- Institutes are comma-separated and normalised via 'toSchoolName'.
427 institutes = map text2ngrams
428 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
431 authors = map text2ngrams
432 $ maybe ["Nothing"] (splitOn ", ")
-- Terms come from the document text, enriched with CoreNLP NP tags.
435 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
437 <$> liftBase (extractTerms lang' $ hasText doc)
439 pure $ HashMap.fromList
440 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
441 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
442 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
443 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | Extraction on a 'Node' delegates to the extraction of its
--   hyperdata payload (the node's last field).
445 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
447 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-- | 'hasText' on a 'Node' delegates to its hyperdata payload.
449 instance HasText a => HasText (Node a)
451 hasText (Node _ _ _ _ _ _ _ h) = hasText h
455 -- | TODO putelsewhere
456 -- | Upgrade function
457 -- Suppose all documents are English (this is the case actually)
-- Re-index every document under the master user's corpora: gather all
-- documents of every NodeCorpus under the master root, then run
-- 'extractInsert' over them in batches of 1000.
458 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
459 indexAllDocumentsWithPosTag = do
460 rootId <- getRootId (UserName userMaster)
461 corpusIds <- findNodesId rootId [NodeCorpus]
462 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
464 _ <- mapM extractInsert (splitEvery 1000 docs)
468 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
469 extractInsert docs = do
470 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
472 mapNgramsDocs' <- mapNodeIdNgrams
473 <$> documentIdWithNgrams
474 (extractNgramsT $ withLang (Multi EN) documentsWithId)
477 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'