Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
44 , indexAllDocumentsWithPosTag
48 import Control.Lens ((^.), view, _Just, makeLenses)
49 import Data.Aeson.TH (deriveJSON)
51 import Data.HashMap.Strict (HashMap)
52 import Data.Hashable (Hashable)
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (catMaybes)
58 import Data.Text (splitOn)
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import GHC.Generics (Generic)
62 import System.FilePath (FilePath)
63 import qualified Data.HashMap.Strict as HashMap
64 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
65 import qualified Data.Map as Map
67 import Gargantext.Core (Lang(..), PosTagAlgo(..))
68 import Gargantext.Core.Ext.IMT (toSchoolName)
69 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
70 import Gargantext.Core.Flow.Types
71 import Gargantext.Core.Text
72 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
73 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
74 import Gargantext.Core.Text.List (buildNgramsLists)
75 import Gargantext.Core.Text.Terms
76 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
77 import Gargantext.Core.Types (POS(NP))
78 import Gargantext.Core.Types.Individu (User(..))
79 import Gargantext.Core.Types.Main
80 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
81 import Gargantext.Database.Action.Flow.List
82 import Gargantext.Database.Action.Flow.Types
83 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
84 import Gargantext.Database.Action.Search (searchDocInDatabase)
85 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
86 import Gargantext.Database.Admin.Types.Hyperdata
87 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
88 import Gargantext.Database.Prelude
89 import Gargantext.Database.Query.Table.Ngrams
90 import Gargantext.Database.Query.Table.Node
91 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
92 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
93 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
94 import Gargantext.Database.Query.Table.NodeNodeNgrams2
95 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
96 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
97 import Gargantext.Database.Types
98 import Gargantext.Prelude
99 import Gargantext.Prelude.Crypto.Hash (Hash)
100 import qualified Gargantext.Core.Text.Corpus.API as API
101 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
103 ------------------------------------------------------------------------
-- Imports for upgrade function
105 import Gargantext.Database.Query.Tree.Root (getRootId)
106 import Gargantext.Database.Query.Tree (findNodesId)
107 import qualified Data.List as List
108 ------------------------------------------------------------------------
109 -- TODO use internal with API name (could be old data)
-- | Provenance of a batch of documents: either already present in the
--   local database ('InternalOrigin') or fetched through one of the
--   external APIs ('ExternalOrigin'). Both record the associated API.
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- NOTE(review): one original line is missing from this
                -- extract between the constructors and the deriving
                -- clause (likely a comment) — verify against the repo.
  deriving (Generic, Eq)
-- Template Haskell: derive lenses for the '_do_api' field and
-- To/FromJSON instances with the "_do_" field prefix stripped.
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
-- | Swagger schema for 'DataOrigin', dropping the "_do_" field prefix
--   so the generated API docs match the JSON encoding above.
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every possible data origin: one 'InternalOrigin' and one
--   'ExternalOrigin' per supported external API.
allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs
-- | Documents to be flowed into a corpus: either ids of documents
--   already stored in the database ('DataOld'), or freshly fetched
--   documents, batched into sub-lists ('DataNew').
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
128 -- TODO use the split parameter in config file
129 getDataText :: FlowCmdM env err m
135 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
137 <$> API.get api (_tt_lang la) q li
139 getDataText (InternalOrigin _) _la q _li = do
140 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
141 (UserName userMaster)
143 (Nothing :: Maybe HyperdataCorpus)
144 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
147 -------------------------------------------------------------------------------
148 flowDataText :: ( FlowCmdM env err m
155 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
157 corpusType = (Nothing :: Maybe HyperdataCorpus)
158 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
160 ------------------------------------------------------------------------
162 flowAnnuaire :: (FlowCmdM env err m)
164 -> Either CorpusName [CorpusId]
168 flowAnnuaire u n l filePath = do
169 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
170 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
172 ------------------------------------------------------------------------
173 flowCorpusFile :: (FlowCmdM env err m)
175 -> Either CorpusName [CorpusId]
176 -> Limit -- Limit the number of docs (for dev purpose)
177 -> TermType Lang -> FileFormat -> FilePath
179 flowCorpusFile u n l la ff fp = do
180 docs <- liftBase ( splitEvery 500
184 flowCorpus u n la (map (map toHyperdataDocument) docs)
186 ------------------------------------------------------------------------
187 -- | TODO improve the needed type to create/update a corpus
188 -- (For now, Either is enough)
189 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
191 -> Either CorpusName [CorpusId]
195 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
198 flow :: ( FlowCmdM env err m
204 -> Either CorpusName [CorpusId]
208 flow c u cn la docs = do
209 -- TODO if public insertMasterDocs else insertUserDocs
210 ids <- traverse (insertMasterDocs c la) docs
211 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
213 ------------------------------------------------------------------------
214 flowCorpusUser :: ( FlowCmdM env err m
219 -> Either CorpusName [CorpusId]
223 flowCorpusUser l user corpusName ctype ids = do
225 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
226 -- NodeTexts is first
227 _tId <- insertDefaultNode NodeTexts userCorpusId userId
228 -- printDebug "NodeTexts: " tId
230 -- NodeList is second
231 listId <- getOrMkList userCorpusId userId
232 -- _cooc <- insertDefaultNode NodeListCooc listId userId
233 -- TODO: check if present already, ignore
234 _ <- Doc.add userCorpusId ids
236 -- printDebug "Node Text Ids:" tId
239 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
241 --let gp = (GroupParams l 2 3 (StopSize 3))
242 let gp = GroupWithPosTag l CoreNLP HashMap.empty
243 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
245 _userListId <- flowList_DbRepo listId ngs
246 _mastListId <- getOrMkList masterCorpusId masterUserId
247 -- _ <- insertOccsUpdates userCorpusId mastListId
248 -- printDebug "userListId" userListId
250 _ <- insertDefaultNode NodeDashboard userCorpusId userId
251 _ <- insertDefaultNode NodeGraph userCorpusId userId
252 --_ <- mkPhylo userCorpusId userId
254 -- _ <- mkAnnuaire rootUserId userId
258 insertMasterDocs :: ( FlowCmdM env err m
266 insertMasterDocs c lang hs = do
267 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
268 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
269 _ <- Doc.add masterCorpusId ids'
271 -- create a corpus with database name (CSV or PubMed)
272 -- add documents to the corpus (create node_node link)
273 -- this will enable global database monitoring
275 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
276 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
278 <$> documentIdWithNgrams
279 (extractNgramsT $ withLang lang documentsWithId)
282 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
283 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
286 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
289 lId <- getOrMkList masterCorpusId masterUserId
290 mapCgramsId <- listInsertDb lId toNodeNgramsW'
291 $ map (first _ngramsTerms . second Map.keys)
292 $ HashMap.toList mapNgramsDocs
294 _return <- insertNodeNodeNgrams2
295 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
296 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
297 <*> Just (fromIntegral w :: Double)
298 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
299 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
300 , (nId, w) <- Map.toList mapNodeIdWeight
303 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
305 _ <- insertDocNgrams lId indexedNgrams
308 ------------------------------------------------------------------------
309 -- TODO Type NodeDocumentUnicised
310 insertDocs :: ( FlowCmdM env err m
317 -> m ([DocId], [Indexed NodeId a])
318 insertDocs uId cId hs = do
319 let docs = map addUniqId hs
320 newIds <- insertDb uId cId docs
321 -- printDebug "newIds" newIds
323 newIds' = map reId newIds
324 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
325 _ <- Doc.add cId newIds'
326 pure (newIds', documentsWithId)
329 ------------------------------------------------------------------------
330 viewUniqId' :: UniqId a
333 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
335 err = panic "[ERROR] Database.Flow.toInsert"
338 toInserted :: [ReturnId]
341 Map.fromList . map (\r -> (reUniqId r, r) )
342 . filter (\r -> reInserted r == True)
344 mergeData :: Map Hash ReturnId
346 -> [Indexed NodeId a]
347 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
349 toDocumentWithId (sha,hpd) =
350 Indexed <$> fmap reId (lookup sha rs)
353 ------------------------------------------------------------------------
354 ------------------------------------------------------------------------
355 ------------------------------------------------------------------------
356 documentIdWithNgrams :: HasNodeError err
358 -> Cmd err (HashMap b (Map NgramsType Int)))
359 -> [Indexed NodeId a]
360 -> Cmd err [DocumentIdWithNgrams a b]
361 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
363 toDocumentIdWithNgrams d = do
365 pure $ DocumentIdWithNgrams d e
368 -- | TODO check optimization
369 mapNodeIdNgrams :: (Ord b, Hashable b)
370 => [DocumentIdWithNgrams a b]
375 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
377 f :: DocumentIdWithNgrams a b
378 -> HashMap b (Map NgramsType (Map NodeId Int))
379 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
381 nId = _index $ documentWithId d
384 ------------------------------------------------------------------------
385 instance ExtractNgramsT HyperdataContact
387 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
389 extract :: TermType Lang -> HyperdataContact
390 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
392 let authors = map text2ngrams
393 $ maybe ["Nothing"] (\a -> [a])
394 $ view (hc_who . _Just . cw_lastName) hc'
396 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
399 instance ExtractNgramsT HyperdataDocument
401 extractNgramsT :: TermType Lang
403 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
404 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
406 extractNgramsT' :: TermType Lang
408 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
409 extractNgramsT' lang' doc = do
410 let source = text2ngrams
411 $ maybe "Nothing" identity
414 institutes = map text2ngrams
415 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
418 authors = map text2ngrams
419 $ maybe ["Nothing"] (splitOn ", ")
422 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
424 <$> liftBase (extractTerms lang' $ hasText doc)
426 pure $ HashMap.fromList
427 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
428 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
429 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
430 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | Extracting ngrams from a 'Node' delegates to its hyperdata payload
--   (the last field of 'Node').
-- NOTE(review): the one-line `where` separating head and method was
-- missing from this extract and has been restored — verify against repo.
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-- | A node's text is the text of its hyperdata payload
--   (the last field of 'Node').
-- NOTE(review): the one-line `where` separating head and method was
-- missing from this extract and has been restored — verify against repo.
instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h
442 -- | TODO putelsewhere
443 -- | Upgrade function
444 -- Suppose all documents are English (this is the case actually)
445 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
446 indexAllDocumentsWithPosTag = do
447 rootId <- getRootId (UserName userMaster)
448 corpusIds <- findNodesId rootId [NodeCorpus]
449 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
451 _ <- mapM extractInsert (splitEvery 1000 docs)
455 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
456 extractInsert docs = do
457 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
459 mapNgramsDocs' <- mapNodeIdNgrams
460 <$> documentIdWithNgrams
461 (extractNgramsT $ withLang (Multi EN) documentsWithId)
464 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'