2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
39 , getOrMk_RootWithCorpus
45 , indexAllDocumentsWithPosTag
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
59 import qualified Data.Text as T
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
68 import Gargantext.Core (Lang(..), PosTagAlgo(..))
69 import Gargantext.Core.Ext.IMT (toSchoolName)
70 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
71 import Gargantext.Core.Flow.Types
72 import Gargantext.Core.Text
73 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Core.Text.List (buildNgramsLists)
76 import Gargantext.Core.Text.Terms
77 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
78 import Gargantext.Core.Types (POS(NP))
79 import Gargantext.Core.Types.Individu (User(..))
80 import Gargantext.Core.Types.Main
81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
82 import Gargantext.Database.Action.Flow.List
83 import Gargantext.Database.Action.Flow.Types
84 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
85 import Gargantext.Database.Action.Search (searchDocInDatabase)
86 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
87 import Gargantext.Database.Admin.Types.Hyperdata
88 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
89 import Gargantext.Database.Prelude
90 import Gargantext.Database.Query.Table.Ngrams
91 import Gargantext.Database.Query.Table.Node
92 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
93 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
94 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
95 import Gargantext.Database.Query.Table.NodeNodeNgrams2
96 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
97 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
98 import Gargantext.Database.Types
99 import Gargantext.Prelude
100 import Gargantext.Prelude.Crypto.Hash (Hash)
101 import qualified Gargantext.Core.Text.Corpus.API as API
102 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
104 ------------------------------------------------------------------------
105 -- Impots for upgrade function
106 import Gargantext.Database.Query.Tree.Root (getRootId)
107 import Gargantext.Database.Query.Tree (findNodesId)
108 import qualified Data.List as List
109 ------------------------------------------------------------------------
110 -- TODO use internal with API name (could be old data)
111 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
112 | ExternalOrigin { _do_api :: API.ExternalAPIs }
114 deriving (Generic, Eq)
116 makeLenses ''DataOrigin
117 deriveJSON (unPrefix "_do_") ''DataOrigin
118 instance ToSchema DataOrigin where
119 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
121 allDataOrigins :: [DataOrigin]
122 allDataOrigins = map InternalOrigin API.externalAPIs
123 <> map ExternalOrigin API.externalAPIs
126 data DataText = DataOld ![NodeId]
127 | DataNew ![[HyperdataDocument]]
129 -- TODO use the split parameter in config file
130 getDataText :: FlowCmdM env err m
136 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
138 <$> API.get api (_tt_lang la) q li
140 getDataText (InternalOrigin _) _la q _li = do
141 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
142 (UserName userMaster)
144 (Nothing :: Maybe HyperdataCorpus)
145 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
148 -------------------------------------------------------------------------------
149 flowDataText :: ( FlowCmdM env err m
156 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
158 corpusType = (Nothing :: Maybe HyperdataCorpus)
159 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
161 ------------------------------------------------------------------------
163 flowAnnuaire :: (FlowCmdM env err m)
165 -> Either CorpusName [CorpusId]
169 flowAnnuaire u n l filePath = do
170 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
171 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
173 ------------------------------------------------------------------------
174 flowCorpusFile :: (FlowCmdM env err m)
176 -> Either CorpusName [CorpusId]
177 -> Limit -- Limit the number of docs (for dev purpose)
178 -> TermType Lang -> FileFormat -> FilePath
180 flowCorpusFile u n l la ff fp = do
181 eParsed <- liftBase $ parseFile ff fp
184 let docs = splitEvery 500 $ take l parsed
185 flowCorpus u n la (map (map toHyperdataDocument) docs)
186 Left e -> panic $ "Error: " <> (T.pack e)
188 ------------------------------------------------------------------------
189 -- | TODO improve the needed type to create/update a corpus
190 -- (For now, Either is enough)
191 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
193 -> Either CorpusName [CorpusId]
197 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
200 flow :: ( FlowCmdM env err m
206 -> Either CorpusName [CorpusId]
210 flow c u cn la docs = do
211 -- TODO if public insertMasterDocs else insertUserDocs
212 ids <- traverse (insertMasterDocs c la) docs
213 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
215 ------------------------------------------------------------------------
216 flowCorpusUser :: ( FlowCmdM env err m
221 -> Either CorpusName [CorpusId]
225 flowCorpusUser l user corpusName ctype ids = do
227 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
228 -- NodeTexts is first
229 _tId <- insertDefaultNode NodeTexts userCorpusId userId
230 -- printDebug "NodeTexts: " tId
232 -- NodeList is second
233 listId <- getOrMkList userCorpusId userId
234 -- _cooc <- insertDefaultNode NodeListCooc listId userId
235 -- TODO: check if present already, ignore
236 _ <- Doc.add userCorpusId ids
238 -- printDebug "Node Text Ids:" tId
241 (masterUserId, _masterRootId, masterCorpusId)
242 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
244 --let gp = (GroupParams l 2 3 (StopSize 3))
245 let gp = GroupWithPosTag l CoreNLP HashMap.empty
246 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
248 _userListId <- flowList_DbRepo listId ngs
249 _mastListId <- getOrMkList masterCorpusId masterUserId
250 -- _ <- insertOccsUpdates userCorpusId mastListId
251 -- printDebug "userListId" userListId
253 _ <- insertDefaultNode NodeDashboard userCorpusId userId
254 _ <- insertDefaultNode NodeGraph userCorpusId userId
255 --_ <- mkPhylo userCorpusId userId
257 -- _ <- mkAnnuaire rootUserId userId
261 insertMasterDocs :: ( FlowCmdM env err m
269 insertMasterDocs c lang hs = do
270 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
271 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
272 _ <- Doc.add masterCorpusId ids'
274 -- create a corpus with database name (CSV or PubMed)
275 -- add documents to the corpus (create node_node link)
276 -- this will enable global database monitoring
278 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
279 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
281 <$> documentIdWithNgrams
282 (extractNgramsT $ withLang lang documentsWithId)
285 lId <- getOrMkList masterCorpusId masterUserId
286 _ <- saveDocNgramsWith lId mapNgramsDocs'
288 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
291 saveDocNgramsWith :: ( FlowCmdM env err m)
293 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
295 saveDocNgramsWith lId mapNgramsDocs' = do
296 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
297 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
300 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
303 mapCgramsId <- listInsertDb lId toNodeNgramsW'
304 $ map (first _ngramsTerms . second Map.keys)
305 $ HashMap.toList mapNgramsDocs
308 _return <- insertNodeNodeNgrams2
309 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
310 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
311 <*> Just (fromIntegral w :: Double)
312 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
313 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
314 , (nId, w) <- Map.toList mapNodeIdWeight
317 _ <- insertDocNgrams lId indexedNgrams
322 ------------------------------------------------------------------------
323 -- TODO Type NodeDocumentUnicised
324 insertDocs :: ( FlowCmdM env err m
331 -> m ([DocId], [Indexed NodeId a])
332 insertDocs uId cId hs = do
333 let docs = map addUniqId hs
334 newIds <- insertDb uId cId docs
335 -- printDebug "newIds" newIds
337 newIds' = map reId newIds
338 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
339 _ <- Doc.add cId newIds'
340 pure (newIds', documentsWithId)
343 ------------------------------------------------------------------------
344 viewUniqId' :: UniqId a
347 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
349 err = panic "[ERROR] Database.Flow.toInsert"
352 toInserted :: [ReturnId]
355 Map.fromList . map (\r -> (reUniqId r, r) )
356 . filter (\r -> reInserted r == True)
358 mergeData :: Map Hash ReturnId
360 -> [Indexed NodeId a]
361 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
363 toDocumentWithId (sha,hpd) =
364 Indexed <$> fmap reId (lookup sha rs)
367 ------------------------------------------------------------------------
368 ------------------------------------------------------------------------
369 ------------------------------------------------------------------------
370 documentIdWithNgrams :: HasNodeError err
372 -> Cmd err (HashMap b (Map NgramsType Int)))
373 -> [Indexed NodeId a]
374 -> Cmd err [DocumentIdWithNgrams a b]
375 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
377 toDocumentIdWithNgrams d = do
379 pure $ DocumentIdWithNgrams d e
382 -- | TODO check optimization
383 mapNodeIdNgrams :: (Ord b, Hashable b)
384 => [DocumentIdWithNgrams a b]
389 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
391 f :: DocumentIdWithNgrams a b
392 -> HashMap b (Map NgramsType (Map NodeId Int))
393 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
395 nId = _index $ documentWithId d
398 ------------------------------------------------------------------------
399 instance ExtractNgramsT HyperdataContact
401 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
403 extract :: TermType Lang -> HyperdataContact
404 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
406 let authors = map text2ngrams
407 $ maybe ["Nothing"] (\a -> [a])
408 $ view (hc_who . _Just . cw_lastName) hc'
410 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
413 instance ExtractNgramsT HyperdataDocument
415 extractNgramsT :: TermType Lang
417 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
418 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
420 extractNgramsT' :: TermType Lang
422 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
423 extractNgramsT' lang' doc = do
424 let source = text2ngrams
425 $ maybe "Nothing" identity
428 institutes = map text2ngrams
429 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
432 authors = map text2ngrams
433 $ maybe ["Nothing"] (T.splitOn ", ")
436 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
438 <$> liftBase (extractTerms lang' $ hasText doc)
440 pure $ HashMap.fromList
441 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
442 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
443 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
444 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
446 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
448 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
450 instance HasText a => HasText (Node a)
452 hasText (Node _ _ _ _ _ _ _ h) = hasText h
456 -- | TODO putelsewhere
457 -- | Upgrade function
458 -- Suppose all documents are English (this is the case actually)
459 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
460 indexAllDocumentsWithPosTag = do
461 rootId <- getRootId (UserName userMaster)
462 corpusIds <- findNodesId rootId [NodeCorpus]
463 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
465 _ <- mapM extractInsert (splitEvery 1000 docs)
469 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
470 extractInsert docs = do
471 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
473 mapNgramsDocs' <- mapNodeIdNgrams
474 <$> documentIdWithNgrams
475 (extractNgramsT $ withLang (Multi EN) documentsWithId)
478 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'