2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
44 , indexAllDocumentsWithPosTag
48 import Control.Lens ((^.), view, _Just, makeLenses)
49 import Data.Aeson.TH (deriveJSON)
51 import Data.HashMap.Strict (HashMap)
52 import Data.Hashable (Hashable)
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (catMaybes)
58 import Data.Text (splitOn)
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import GHC.Generics (Generic)
62 import System.FilePath (FilePath)
63 import qualified Data.HashMap.Strict as HashMap
64 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
65 import qualified Data.Map as Map
67 import Gargantext.Core (Lang(..), PosTagAlgo(..))
68 import Gargantext.Core.Ext.IMT (toSchoolName)
69 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
70 import Gargantext.Core.Flow.Types
71 import Gargantext.Core.Text
72 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
73 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
74 import Gargantext.Core.Text.List (buildNgramsLists)
75 import Gargantext.Core.Text.Terms
76 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
77 import Gargantext.Core.Types (POS(NP))
78 import Gargantext.Core.Types.Individu (User(..))
79 import Gargantext.Core.Types.Main
80 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
81 import Gargantext.Database.Action.Flow.List
82 import Gargantext.Database.Action.Flow.Types
83 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
84 import Gargantext.Database.Action.Search (searchDocInDatabase)
85 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
86 import Gargantext.Database.Admin.Types.Hyperdata
87 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
88 import Gargantext.Database.Prelude
89 import Gargantext.Database.Query.Table.Ngrams
90 import Gargantext.Database.Query.Table.Node
91 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
92 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
93 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
94 import Gargantext.Database.Query.Table.NodeNodeNgrams2
95 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
96 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
97 import Gargantext.Database.Types
98 import Gargantext.Prelude
99 import Gargantext.Prelude.Crypto.Hash (Hash)
100 import qualified Gargantext.Core.Text.Corpus.API as API
101 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
103 ------------------------------------------------------------------------
104 -- Impots for upgrade function
105 import Gargantext.Database.Query.Tree.Root (getRootId)
106 import Gargantext.Database.Query.Tree (findNodesId)
107 import qualified Data.List as List
108 ------------------------------------------------------------------------
109 -- TODO use internal with API name (could be old data)
110 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
111 | ExternalOrigin { _do_api :: API.ExternalAPIs }
113 deriving (Generic, Eq)
115 makeLenses ''DataOrigin
116 deriveJSON (unPrefix "_do_") ''DataOrigin
117 instance ToSchema DataOrigin where
118 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
120 allDataOrigins :: [DataOrigin]
121 allDataOrigins = map InternalOrigin API.externalAPIs
122 <> map ExternalOrigin API.externalAPIs
125 data DataText = DataOld ![NodeId]
126 | DataNew ![[HyperdataDocument]]
128 -- TODO use the split parameter in config file
129 getDataText :: FlowCmdM env err m
135 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
137 <$> API.get api (_tt_lang la) q li
139 getDataText (InternalOrigin _) _la q _li = do
140 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
141 (UserName userMaster)
143 (Nothing :: Maybe HyperdataCorpus)
144 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
147 -------------------------------------------------------------------------------
148 flowDataText :: ( FlowCmdM env err m
155 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
157 corpusType = (Nothing :: Maybe HyperdataCorpus)
158 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
160 ------------------------------------------------------------------------
162 flowAnnuaire :: (FlowCmdM env err m)
164 -> Either CorpusName [CorpusId]
168 flowAnnuaire u n l filePath = do
169 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
170 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
172 ------------------------------------------------------------------------
173 flowCorpusFile :: (FlowCmdM env err m)
175 -> Either CorpusName [CorpusId]
176 -> Limit -- Limit the number of docs (for dev purpose)
177 -> TermType Lang -> FileFormat -> FilePath
179 flowCorpusFile u n l la ff fp = do
180 docs <- liftBase ( splitEvery 500
184 flowCorpus u n la (map (map toHyperdataDocument) docs)
186 ------------------------------------------------------------------------
187 -- | TODO improve the needed type to create/update a corpus
188 -- (For now, Either is enough)
189 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
191 -> Either CorpusName [CorpusId]
195 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
198 flow :: ( FlowCmdM env err m
204 -> Either CorpusName [CorpusId]
208 flow c u cn la docs = do
209 -- TODO if public insertMasterDocs else insertUserDocs
210 ids <- traverse (insertMasterDocs c la) docs
211 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
213 ------------------------------------------------------------------------
214 flowCorpusUser :: ( FlowCmdM env err m
219 -> Either CorpusName [CorpusId]
223 flowCorpusUser l user corpusName ctype ids = do
225 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
226 -- NodeTexts is first
227 _tId <- insertDefaultNode NodeTexts userCorpusId userId
228 -- printDebug "NodeTexts: " tId
230 -- NodeList is second
231 listId <- getOrMkList userCorpusId userId
232 -- _cooc <- insertDefaultNode NodeListCooc listId userId
233 -- TODO: check if present already, ignore
234 _ <- Doc.add userCorpusId ids
236 -- printDebug "Node Text Ids:" tId
239 (masterUserId, _masterRootId, masterCorpusId)
240 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
242 --let gp = (GroupParams l 2 3 (StopSize 3))
243 let gp = GroupWithPosTag l CoreNLP HashMap.empty
244 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
246 _userListId <- flowList_DbRepo listId ngs
247 _mastListId <- getOrMkList masterCorpusId masterUserId
248 -- _ <- insertOccsUpdates userCorpusId mastListId
249 -- printDebug "userListId" userListId
251 _ <- insertDefaultNode NodeDashboard userCorpusId userId
252 _ <- insertDefaultNode NodeGraph userCorpusId userId
253 --_ <- mkPhylo userCorpusId userId
255 -- _ <- mkAnnuaire rootUserId userId
259 insertMasterDocs :: ( FlowCmdM env err m
267 insertMasterDocs c lang hs = do
268 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
269 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
270 _ <- Doc.add masterCorpusId ids'
272 -- create a corpus with database name (CSV or PubMed)
273 -- add documents to the corpus (create node_node link)
274 -- this will enable global database monitoring
276 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
277 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
279 <$> documentIdWithNgrams
280 (extractNgramsT $ withLang lang documentsWithId)
283 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
284 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
287 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
290 lId <- getOrMkList masterCorpusId masterUserId
291 mapCgramsId <- listInsertDb lId toNodeNgramsW'
292 $ map (first _ngramsTerms . second Map.keys)
293 $ HashMap.toList mapNgramsDocs
295 _return <- insertNodeNodeNgrams2
296 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
297 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
298 <*> Just (fromIntegral w :: Double)
299 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
300 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
301 , (nId, w) <- Map.toList mapNodeIdWeight
304 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
306 _ <- insertDocNgrams lId indexedNgrams
309 ------------------------------------------------------------------------
310 -- TODO Type NodeDocumentUnicised
311 insertDocs :: ( FlowCmdM env err m
318 -> m ([DocId], [Indexed NodeId a])
319 insertDocs uId cId hs = do
320 let docs = map addUniqId hs
321 newIds <- insertDb uId cId docs
322 -- printDebug "newIds" newIds
324 newIds' = map reId newIds
325 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
326 _ <- Doc.add cId newIds'
327 pure (newIds', documentsWithId)
330 ------------------------------------------------------------------------
331 viewUniqId' :: UniqId a
334 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
336 err = panic "[ERROR] Database.Flow.toInsert"
339 toInserted :: [ReturnId]
342 Map.fromList . map (\r -> (reUniqId r, r) )
343 . filter (\r -> reInserted r == True)
345 mergeData :: Map Hash ReturnId
347 -> [Indexed NodeId a]
348 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
350 toDocumentWithId (sha,hpd) =
351 Indexed <$> fmap reId (lookup sha rs)
354 ------------------------------------------------------------------------
355 ------------------------------------------------------------------------
356 ------------------------------------------------------------------------
357 documentIdWithNgrams :: HasNodeError err
359 -> Cmd err (HashMap b (Map NgramsType Int)))
360 -> [Indexed NodeId a]
361 -> Cmd err [DocumentIdWithNgrams a b]
362 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
364 toDocumentIdWithNgrams d = do
366 pure $ DocumentIdWithNgrams d e
369 -- | TODO check optimization
370 mapNodeIdNgrams :: (Ord b, Hashable b)
371 => [DocumentIdWithNgrams a b]
376 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
378 f :: DocumentIdWithNgrams a b
379 -> HashMap b (Map NgramsType (Map NodeId Int))
380 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
382 nId = _index $ documentWithId d
385 ------------------------------------------------------------------------
386 instance ExtractNgramsT HyperdataContact
388 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
390 extract :: TermType Lang -> HyperdataContact
391 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
393 let authors = map text2ngrams
394 $ maybe ["Nothing"] (\a -> [a])
395 $ view (hc_who . _Just . cw_lastName) hc'
397 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
400 instance ExtractNgramsT HyperdataDocument
402 extractNgramsT :: TermType Lang
404 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
405 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
407 extractNgramsT' :: TermType Lang
409 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
410 extractNgramsT' lang' doc = do
411 let source = text2ngrams
412 $ maybe "Nothing" identity
415 institutes = map text2ngrams
416 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
419 authors = map text2ngrams
420 $ maybe ["Nothing"] (splitOn ", ")
423 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
425 <$> liftBase (extractTerms lang' $ hasText doc)
427 pure $ HashMap.fromList
428 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
429 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
430 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
431 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
433 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
435 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
437 instance HasText a => HasText (Node a)
439 hasText (Node _ _ _ _ _ _ _ h) = hasText h
443 -- | TODO putelsewhere
444 -- | Upgrade function
445 -- Suppose all documents are English (this is the case actually)
446 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
447 indexAllDocumentsWithPosTag = do
448 rootId <- getRootId (UserName userMaster)
449 corpusIds <- findNodesId rootId [NodeCorpus]
450 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
452 _ <- mapM extractInsert (splitEvery 1000 docs)
456 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
457 extractInsert docs = do
458 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
460 mapNgramsDocs' <- mapNodeIdNgrams
461 <$> documentIdWithNgrams
462 (extractNgramsT $ withLang (Multi EN) documentsWithId)
465 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'