2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
39 , getOrMk_RootWithCorpus
45 , indexAllDocumentsWithPosTag
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
59 import qualified Data.Text as T
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
68 import Gargantext.Core (Lang(..), PosTagAlgo(..))
69 import Gargantext.Core.Ext.IMT (toSchoolName)
70 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
71 import Gargantext.Core.Flow.Types
72 import Gargantext.Core.Text
73 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Core.Text.List (buildNgramsLists)
76 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
77 import Gargantext.Core.Text.Terms
78 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
79 import Gargantext.Core.Types (POS(NP))
80 import Gargantext.Core.Types.Individu (User(..))
81 import Gargantext.Core.Types.Main
82 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
83 import Gargantext.Database.Action.Flow.List
84 import Gargantext.Database.Action.Flow.Types
85 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
86 import Gargantext.Database.Action.Search (searchDocInDatabase)
87 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
88 import Gargantext.Database.Admin.Types.Hyperdata
89 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
90 import Gargantext.Database.Prelude
91 import Gargantext.Database.Query.Table.Ngrams
92 import Gargantext.Database.Query.Table.Node
93 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
94 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
95 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
96 import Gargantext.Database.Query.Table.NodeNodeNgrams2
97 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
98 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
99 import Gargantext.Database.Types
100 import Gargantext.Prelude
101 import Gargantext.Prelude.Crypto.Hash (Hash)
102 import qualified Gargantext.Core.Text.Corpus.API as API
103 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
105 ------------------------------------------------------------------------
106 -- Imports for upgrade function
107 import Gargantext.Database.Query.Tree.Root (getRootId)
108 import Gargantext.Database.Query.Tree (findNodesId)
109 import qualified Data.List as List
110 ------------------------------------------------------------------------
111 -- TODO use internal with API name (could be old data)
-- | Where a batch of texts comes from: Gargantext's own master corpus
-- ('InternalOrigin') or an external search API ('ExternalOrigin').
-- Both constructors carry an external-API tag; 'getDataText' dispatches
-- on the constructor.
112 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
113 | ExternalOrigin { _do_api :: API.ExternalAPIs }
115 deriving (Generic, Eq)
-- Template Haskell: lenses plus JSON instances with the "_do_" field
-- prefix stripped from the encoded names.
117 makeLenses ''DataOrigin
118 deriveJSON (unPrefix "_do_") ''DataOrigin
-- Swagger schema mirrors the JSON encoding (same "_do_" prefix handling).
119 instance ToSchema DataOrigin where
120 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every supported 'DataOrigin': each API from 'API.externalAPIs'
-- appears twice, once wrapped as internal and once as external.
122 allDataOrigins :: [DataOrigin]
123 allDataOrigins = map InternalOrigin API.externalAPIs
124 <> map ExternalOrigin API.externalAPIs
-- | Result of fetching texts: ids of documents already present in the
-- database ('DataOld'), or batches of freshly fetched documents
-- ('DataNew'). Fields are strict.
127 data DataText = DataOld ![NodeId]
128 | DataNew ![[HyperdataDocument]]
130 -- TODO use the split parameter in config file
-- | Fetch texts for a query, dispatching on the 'DataOrigin':
-- external origin queries the external API and wraps the result as
-- 'DataNew'; internal origin searches the master corpus in the database
-- (using the stemmed query) and returns the matching ids.
-- NOTE(review): several signature/body lines are not visible in this
-- extract; parameter roles (term type, query, limit) inferred from usage
-- below — confirm against the full source.
131 getDataText :: FlowCmdM env err m
137 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
139 <$> API.get api (_tt_lang la) q li
141 getDataText (InternalOrigin _) _la q _li = do
142 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
143 (UserName userMaster)
145 (Nothing :: Maybe HyperdataCorpus)
-- Stemmed query against the master corpus; keep only the node ids.
146 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
149 -------------------------------------------------------------------------------
-- | Flow previously fetched texts into the given user corpus.
-- 'DataOld': the documents already exist, so only attach their ids via
-- 'flowCorpusUser'. 'DataNew': insert the new batches via 'flowCorpus'.
150 flowDataText :: ( FlowCmdM env err m
156 -> Maybe FlowSocialListWith
158 flowDataText u (DataOld ids) tt cid mfslw = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
-- corpusType fixes the hyperdata type without providing a value.
160 corpusType = (Nothing :: Maybe HyperdataCorpus)
161 flowDataText u (DataNew txt) tt cid mfslw = flowCorpus u (Right [cid]) tt mfslw txt
163 ------------------------------------------------------------------------
-- | Flow a contact-directory (annuaire) file into the database:
-- read the file, split the contacts into batches of 500, then run the
-- generic 'flow' with 'HyperdataAnnuaire' as the corpus type and no
-- social-list parameter.
165 flowAnnuaire :: (FlowCmdM env err m)
167 -> Either CorpusName [CorpusId]
171 flowAnnuaire u n l filePath = do
172 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
173 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs
175 ------------------------------------------------------------------------
-- | Parse a corpus file and flow it into the database.
-- On parse success: keep at most 'l' documents (dev-time limit), batch
-- them by 500, convert each to 'HyperdataDocument' and run 'flowCorpus'.
-- On parse failure: 'panic' with the parser's error message.
176 flowCorpusFile :: (FlowCmdM env err m)
178 -> Either CorpusName [CorpusId]
179 -> Limit -- Limit the number of docs (for dev purpose)
180 -> TermType Lang -> FileFormat -> FilePath
181 -> Maybe FlowSocialListWith
183 flowCorpusFile u n l la ff fp mfslw = do
184 eParsed <- liftBase $ parseFile ff fp
187 let docs = splitEvery 500 $ take l parsed
188 flowCorpus u n la mfslw (map (map toHyperdataDocument) docs)
189 Left e -> panic $ "Error: " <> (T.pack e)
191 ------------------------------------------------------------------------
192 -- | TODO improve the needed type to create/update a corpus
193 -- (For now, Either is enough)
-- 'flow' specialised to a plain user corpus: the corpus-type argument
-- is fixed to @Nothing :: Maybe HyperdataCorpus@.
194 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
196 -> Either CorpusName [CorpusId]
198 -> Maybe FlowSocialListWith
201 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic flow entry point: insert every batch of documents into the
-- master corpus ('insertMasterDocs'), then attach all resulting ids to
-- the user corpus via 'flowCorpusUser'.
204 flow :: ( FlowCmdM env err m
210 -> Either CorpusName [CorpusId]
212 -> Maybe FlowSocialListWith
215 flow c u cn la mfslw docs = do
216 -- TODO if public insertMasterDocs else insertUserDocs
-- One insertMasterDocs call per batch; ids concatenated afterwards.
217 ids <- traverse (insertMasterDocs c la) docs
218 flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
220 ------------------------------------------------------------------------
-- | Attach already-inserted documents to a user's corpus and build its
-- satellite nodes: get/create the root+corpus, create a NodeTexts node,
-- get/create the corpus list, link the document ids, build the ngrams
-- lists (against the master corpus) and persist them, then create
-- Dashboard and Graph nodes.
-- NOTE(review): parts of the signature and some body lines are missing
-- from this extract; the step ordering below is as visible here.
221 flowCorpusUser :: ( FlowCmdM env err m
226 -> Either CorpusName [CorpusId]
229 -> Maybe FlowSocialListWith
231 flowCorpusUser l user corpusName ctype ids mfslw = do
233 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
234 -- NodeTexts is first
235 _tId <- insertDefaultNode NodeTexts userCorpusId userId
236 -- printDebug "NodeTexts: " tId
238 -- NodeList is second
239 listId <- getOrMkList userCorpusId userId
240 -- _cooc <- insertDefaultNode NodeListCooc listId userId
241 -- TODO: check if present already, ignore
242 _ <- Doc.add userCorpusId ids
244 -- printDebug "Node Text Ids:" tId
-- Master corpus is needed as the reference when building ngrams lists.
247 (masterUserId, _masterRootId, masterCorpusId)
248 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
250 --let gp = (GroupParams l 2 3 (StopSize 3))
-- Grouping strategy: POS-tag based (CoreNLP), no pre-seeded groups.
251 let gp = GroupWithPosTag l CoreNLP HashMap.empty
252 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
254 _userListId <- flowList_DbRepo listId ngs
255 _mastListId <- getOrMkList masterCorpusId masterUserId
256 -- _ <- insertOccsUpdates userCorpusId mastListId
257 -- printDebug "userListId" userListId
259 _ <- insertDefaultNode NodeDashboard userCorpusId userId
260 _ <- insertDefaultNode NodeGraph userCorpusId userId
261 --_ <- mkPhylo userCorpusId userId
263 -- _ <- mkAnnuaire rootUserId userId
-- | Insert documents into the master user's corpus and index their
-- ngrams there: get/create the master root+corpus, insert/deduplicate
-- the documents, link them, extract ngrams per document and save the
-- result into the master list via 'saveDocNgramsWith'.
267 insertMasterDocs :: ( FlowCmdM env err m
275 insertMasterDocs c lang hs = do
276 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
277 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
278 _ <- Doc.add masterCorpusId ids'
280 -- create a corpus with database name (CSV or PubMed)
281 -- add documents to the corpus (create node_node link)
282 -- this will enable global database monitoring
284 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
285 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
287 <$> documentIdWithNgrams
288 (extractNgramsT $ withLang lang documentsWithId)
291 lId <- getOrMkList masterCorpusId masterUserId
292 _ <- saveDocNgramsWith lId mapNgramsDocs'
294 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- | Persist extracted document ngrams under a list node:
-- insert the raw ngrams terms (getting their db ids), register them on
-- the list ('listInsertDb'), then write the per-document weights into
-- node_node_ngrams2 and the doc-ngrams table.
297 saveDocNgramsWith :: ( FlowCmdM env err m)
299 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
301 saveDocNgramsWith lId mapNgramsDocs' = do
-- Insert terms first so their ids exist for indexing below.
302 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
303 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
306 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
-- Register (term, ngrams-types) pairs on the list node.
309 mapCgramsId <- listInsertDb lId toNodeNgramsW'
310 $ map (first _ngramsTerms . second Map.keys)
311 $ HashMap.toList mapNgramsDocs
-- Flatten term -> type -> node -> weight into rows; getCgramsId may
-- return Nothing, in which case the row is dropped by catMaybes.
314 _return <- insertNodeNodeNgrams2
315 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
316 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
317 <*> Just (fromIntegral w :: Double)
318 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
319 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
320 , (nId, w) <- Map.toList mapNodeIdWeight
323 _ <- insertDocNgrams lId indexedNgrams
328 ------------------------------------------------------------------------
329 -- TODO Type NodeDocumentUnicised
-- | Insert documents under a parent node, deduplicating by content hash:
-- tag each doc with a unique id, insert, link the returned ids to the
-- parent, and return the new ids paired with the hash-merged documents.
330 insertDocs :: ( FlowCmdM env err m
337 -> m ([DocId], [Indexed NodeId a])
338 insertDocs uId cId hs = do
339 let docs = map addUniqId hs
340 newIds <- insertDb uId cId docs
341 -- printDebug "newIds" newIds
343 newIds' = map reId newIds
-- Join db insert results back to the documents via their unique hash.
344 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
345 _ <- Doc.add cId newIds'
346 pure (newIds', documentsWithId)
349 ------------------------------------------------------------------------
-- | Pair a document with its unique hash. Panics if the hash is absent:
-- every document reaching this point is expected to carry one
-- (see 'addUniqId' in 'insertDocs').
350 viewUniqId' :: UniqId a
353 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
355 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the rows the database actually inserted by their unique id,
-- dropping rows reported as not inserted (i.e. duplicates).
358 toInserted :: [ReturnId]
361 Map.fromList . map (\r -> (reUniqId r, r) )
362 . filter (\r -> reInserted r == True)
-- | Join insert results with the original documents by content hash,
-- keeping only documents whose hash appears in the returned-id map
-- (missing lookups are discarded by catMaybes).
364 mergeData :: Map Hash ReturnId
366 -> [Indexed NodeId a]
367 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
369 toDocumentWithId (sha,hpd) =
370 Indexed <$> fmap reId (lookup sha rs)
373 ------------------------------------------------------------------------
374 ------------------------------------------------------------------------
375 ------------------------------------------------------------------------
-- | Run an ngrams-extraction action over each indexed document,
-- pairing every document with its extracted ngrams counts.
376 documentIdWithNgrams :: HasNodeError err
378 -> Cmd err (HashMap b (Map NgramsType Int)))
379 -> [Indexed NodeId a]
380 -> Cmd err [DocumentIdWithNgrams a b]
381 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
383 toDocumentIdWithNgrams d = do
385 pure $ DocumentIdWithNgrams d e
388 -- | TODO check optimization
-- Merge per-document ngrams maps into one map keyed by ngram, with
-- counts summed per NgramsType and per NodeId across all documents.
389 mapNodeIdNgrams :: (Ord b, Hashable b)
390 => [DocumentIdWithNgrams a b]
395 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
397 f :: DocumentIdWithNgrams a b
398 -> HashMap b (Map NgramsType (Map NodeId Int))
-- Tag each count with the document's node id before merging.
399 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
401 nId = _index $ documentWithId d
404 ------------------------------------------------------------------------
-- | Contacts expose a single kind of ngrams: the last name, filed under
-- the Authors category (falling back to the literal string "Nothing"
-- when absent). Extracted ngrams are truncated to 255 characters.
405 instance ExtractNgramsT HyperdataContact
407 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
409 extract :: TermType Lang -> HyperdataContact
410 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
412 let authors = map text2ngrams
413 $ maybe ["Nothing"] (\a -> [a])
414 $ view (hc_who . _Just . cw_lastName) hc'
416 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
-- | Documents yield four categories of ngrams: source, institutes,
-- authors (all as simple ngrams, with "Nothing" fallbacks) and the text
-- terms extracted by 'extractTerms' then POS-enriched via CoreNLP.
-- All keys are truncated to 255 characters.
419 instance ExtractNgramsT HyperdataDocument
421 extractNgramsT :: TermType Lang
423 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
424 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
426 extractNgramsT' :: TermType Lang
428 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
429 extractNgramsT' lang' doc = do
430 let source = text2ngrams
431 $ maybe "Nothing" identity
-- Institutes are split on ", " and normalised to school names.
434 institutes = map text2ngrams
435 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
438 authors = map text2ngrams
439 $ maybe ["Nothing"] (T.splitOn ", ")
-- Terms come from the document's text, enriched with NP POS tags.
442 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
444 <$> liftBase (extractTerms lang' $ hasText doc)
-- Each ngram gets a count of 1 in its category.
446 pure $ HashMap.fromList
447 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
448 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
449 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
450 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | A node's ngrams are those of its hyperdata payload (last field).
452 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
454 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-- | A node's text is the text of its hyperdata payload (last field).
456 instance HasText a => HasText (Node a)
458 hasText (Node _ _ _ _ _ _ _ h) = hasText h
462 -- | TODO putelsewhere
463 -- | Upgrade function
464 -- Suppose all documents are English (this is the case actually)
-- Walk every corpus under the master user's root, collect all their
-- documents and re-index them in batches of 1000 via 'extractInsert'.
465 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
466 indexAllDocumentsWithPosTag = do
467 rootId <- getRootId (UserName userMaster)
468 corpusIds <- findNodesId rootId [NodeCorpus]
469 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
471 _ <- mapM extractInsert (splitEvery 1000 docs)
-- | Re-extract ngrams for a batch of documents (treated as English,
-- multi-term mode) and insert the extracted ngrams terms.
-- NOTE(review): the tail of this definition is not visible in this
-- extract; only the steps shown below are documented.
475 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
476 extractInsert docs = do
477 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
479 mapNgramsDocs' <- mapNodeIdNgrams
480 <$> documentIdWithNgrams
481 (extractNgramsT $ withLang (Multi EN) documentsWithId)
484 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'