2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
50 import Data.HashMap.Strict (HashMap)
51 import Data.Hashable (Hashable)
52 import Data.List (concat)
53 import Data.Map (Map, lookup)
54 import Data.Maybe (catMaybes)
57 import Data.Text (splitOn)
58 import Data.Traversable (traverse)
59 import Data.Tuple.Extra (first, second)
60 import GHC.Generics (Generic)
61 import System.FilePath (FilePath)
62 import qualified Data.HashMap.Strict as HashMap
63 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
64 import qualified Data.Map as Map
66 import Gargantext.Core (Lang(..), PosTagAlgo(..))
67 import Gargantext.Core.Ext.IMT (toSchoolName)
68 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
69 import Gargantext.Core.Flow.Types
70 import Gargantext.Core.Text
71 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
72 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
73 import Gargantext.Core.Text.List (buildNgramsLists)
74 import Gargantext.Core.Text.Terms
75 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
76 import Gargantext.Core.Types (POS(NP))
77 import Gargantext.Core.Types.Individu (User(..))
78 import Gargantext.Core.Types.Main
79 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
80 import Gargantext.Database.Action.Flow.List
81 import Gargantext.Database.Action.Flow.Types
82 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
83 import Gargantext.Database.Action.Search (searchDocInDatabase)
84 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
85 import Gargantext.Database.Admin.Types.Hyperdata
86 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
87 import Gargantext.Database.Prelude
88 import Gargantext.Database.Query.Table.Ngrams
89 import Gargantext.Database.Query.Table.Node
90 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
91 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
92 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
93 import Gargantext.Database.Query.Table.NodeNodeNgrams2
94 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
95 import Gargantext.Database.Schema.Node (NodePoly(..))
96 import Gargantext.Database.Types
97 import Gargantext.Prelude
98 import Gargantext.Prelude.Crypto.Hash (Hash)
99 import qualified Gargantext.Core.Text.Corpus.API as API
100 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
102 ------------------------------------------------------------------------
103 -- TODO use internal with API name (could be old data)
-- | Where a batch of documents comes from.  'getDataText' below shows the
--   contract: 'InternalOrigin' searches the local master corpus, while
--   'ExternalOrigin' performs a live query against the named external API.
--   Both constructors carry the API concerned in '_do_api'.
-- NOTE(review): source lines 106 and 108 are missing from this view; the
-- constructor list / deriving clause may be larger than shown.
104 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
105 | ExternalOrigin { _do_api :: API.ExternalAPIs }
107 deriving (Generic, Eq)
-- Lenses + JSON instances are derived with the "_do_" field prefix stripped.
109 makeLenses ''DataOrigin
110 deriveJSON (unPrefix "_do_") ''DataOrigin
-- Swagger schema mirrors the JSON encoding (same "_do_" prefix handling).
111 instance ToSchema DataOrigin where
112 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every 'DataOrigin' the system can offer: each known external API is
--   listed once as an 'InternalOrigin' and once as an 'ExternalOrigin'
--   (internal entries first, preserving the order of 'API.externalAPIs').
-- Fix(review): stray line numbers fused into the source text ("114 ", "115 ",
-- "116 ") made these lines invalid Haskell; reconstructed the definition.
allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs
-- | The two shapes of textual input the flow accepts: ids of documents
--   already stored in the database ('DataOld'), or freshly fetched batches
--   of 'HyperdataDocument's ('DataNew').  Fields are strict.
-- NOTE(review): source line 121 is missing from this view; a deriving
-- clause (if any) is not visible here.
119 data DataText = DataOld ![NodeId]
120 | DataNew ![[HyperdataDocument]]
122 -- TODO use the split parameter in config file
-- | Fetch a 'DataText' for a query, dispatching on the 'DataOrigin':
--   * 'ExternalOrigin': call the external API (lifted into the base monad)
--     and wrap the returned documents in 'DataNew';
--   * 'InternalOrigin': search the master corpus in the local database.
-- NOTE(review): source lines 124-128, 130, 132, 136 and 139-140 are missing
-- from this view, so the full constraint list and the final expression of
-- the internal branch (presumably @pure (DataOld ids)@) are unconfirmed.
123 getDataText :: FlowCmdM env err m
129 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
131 <$> API.get api (_tt_lang la) q li
133 getDataText (InternalOrigin _) _la q _li = do
-- Resolve (or create) the master user's root and corpus; only the corpus id
-- is needed below.
134 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
135 (UserName userMaster)
137 (Nothing :: Maybe HyperdataCorpus)
-- Stem the query before searching; keep only the document ids (fst).
138 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
141 -------------------------------------------------------------------------------
-- | Run the ingestion flow for a 'DataText': existing document ids go
--   straight to 'flowCorpusUser' (no corpus hyperdata supplied), while new
--   documents go through the full 'flowCorpus' pipeline.
-- NOTE(review): signature lines 143-148 and the @where@ keyword (line 150)
-- binding 'corpusType' are missing from this view.
142 flowDataText :: ( FlowCmdM env err m
149 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
-- No corpus hyperdata when re-flowing documents that already exist.
151 corpusType = (Nothing :: Maybe HyperdataCorpus)
152 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
154 ------------------------------------------------------------------------
-- | Ingest an IMT-users annuaire file: deserialise the contacts from
--   @filePath@, split them into batches of 500, and push them through the
--   generic 'flow' with no pre-existing 'HyperdataAnnuaire'.
-- NOTE(review): signature lines 157 and 159-161 are missing from this view,
-- so the exact types of @u@, @n@ and @l@ cannot be confirmed here.
156 flowAnnuaire :: (FlowCmdM env err m)
158 -> Either CorpusName [CorpusId]
162 flowAnnuaire u n l filePath = do
-- Batches of 500 bound the size of each subsequent insert.
163 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
164 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
166 ------------------------------------------------------------------------
-- | Parse a corpus file in the given 'FileFormat', batch the parsed
--   documents by 500, and run 'flowCorpus' over them.
-- NOTE(review): source lines 168, 172 and 175-177 are missing from this
-- view; the missing body lines presumably apply the limit @l@ and call
-- 'parseFile' with @ff fp@ — confirm against the full source, since @l@,
-- @ff@ and @fp@ are otherwise unused in the visible lines.
167 flowCorpusFile :: (FlowCmdM env err m)
169 -> Either CorpusName [CorpusId]
170 -> Limit -- Limit the number of docs (for dev purpose)
171 -> TermType Lang -> FileFormat -> FilePath
173 flowCorpusFile u n l la ff fp = do
174 docs <- liftBase ( splitEvery 500
178 flowCorpus u n la (map (map toHyperdataDocument) docs)
180 ------------------------------------------------------------------------
181 -- | TODO improve the needed type to create/update a corpus
182 -- (For now, Either is enough)
-- | Specialisation of 'flow' for corpora: supplies no corpus hyperdata
--   ('Nothing' at type 'Maybe HyperdataCorpus') and forwards everything else.
-- NOTE(review): signature lines 184 and 186-188 are missing from this view.
183 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
185 -> Either CorpusName [CorpusId]
189 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic ingestion pipeline: insert each batch of documents into the
--   master corpus ('insertMasterDocs'), then wire all resulting ids into
--   the user's corpus via 'flowCorpusUser'.
-- NOTE(review): signature lines 193-197 and 199-201 are missing from this
-- view, so the full constraint/parameter list is unconfirmed.
192 flow :: ( FlowCmdM env err m
198 -> Either CorpusName [CorpusId]
202 flow c u cn la docs = do
203 -- TODO if public insertMasterDocs else insertUserDocs
-- One 'insertMasterDocs' call per batch; ids are concatenated afterwards.
204 ids <- traverse (insertMasterDocs c la) docs
205 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
207 ------------------------------------------------------------------------
-- | Attach freshly inserted documents to a user's corpus and build the
--   surrounding node structure: a NodeTexts child, the corpus list, the
--   ngrams lists (grouped via part-of-speech tagging), plus Dashboard and
--   Graph nodes.  Also resolves the master corpus to share its list.
-- NOTE(review): many interior source lines are missing from this view
--   (209-212, 214-216, 218, 223, 229, 231-232, 234, 238, 243, 247, 249-…),
--   including the final expression (presumably returning @userCorpusId@) —
--   confirm against the full source.
208 flowCorpusUser :: ( FlowCmdM env err m
213 -> Either CorpusName [CorpusId]
217 flowCorpusUser l user corpusName ctype ids = do
219 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
220 -- NodeTexts is first
221 _tId <- insertDefaultNode NodeTexts userCorpusId userId
222 -- printDebug "NodeTexts: " tId
224 -- NodeList is second
225 listId <- getOrMkList userCorpusId userId
226 -- _cooc <- insertDefaultNode NodeListCooc listId userId
227 -- TODO: check if present already, ignore
-- Link the document ids to the user's corpus (node_node rows).
228 _ <- Doc.add userCorpusId ids
230 -- printDebug "Node Text Ids:" tId
-- Master corpus is resolved with an empty name; only its id is used below.
233 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
235 -- let gp = (GroupParams l 2 3 (StopSize 3))
-- Grouping strategy: POS tagging via CoreNLP, with no preset substitutions.
236 let gp = GroupWithPosTag l CoreNLP HashMap.empty
237 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
239 _userListId <- flowList_DbRepo listId ngs
240 _mastListId <- getOrMkList masterCorpusId masterUserId
241 -- _ <- insertOccsUpdates userCorpusId mastListId
242 -- printDebug "userListId" userListId
244 _ <- insertDefaultNode NodeDashboard userCorpusId userId
245 _ <- insertDefaultNode NodeGraph userCorpusId userId
246 --_ <- mkPhylo userCorpusId userId
248 -- _ <- mkAnnuaire rootUserId userId
-- | Insert documents into the *master* corpus and index their ngrams:
--   1. resolve/create the master root + corpus;
--   2. insert the documents ('insertDocs') and link them ('Doc.add');
--   3. extract ngrams per document, de-duplicate and insert them;
--   4. record per-(document, ngrams-type) weights in node_node_ngrams2
--      and the document-ngrams links.
-- NOTE(review): several interior source lines are missing from this view
--   (253-259, 264, 268, 271, 274-275, 278-279, 281-282, 287, 295-296, 298,
--   300-…), including the @where@/@let@ framing of 'mapNgramsDocs'' and the
--   function's final expression — confirm against the full source.
252 insertMasterDocs :: ( FlowCmdM env err m
260 insertMasterDocs c lang hs = do
261 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
262 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
-- Link the new documents to the master corpus.
263 _ <- Doc.add masterCorpusId ids'
265 -- create a corpus with database name (CSV or PubMed)
266 -- add documents to the corpus (create node_node link)
267 -- this will enable global database monitoring
269 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
270 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
272 <$> documentIdWithNgrams
273 (extractNgramsT $ withLang lang documentsWithId)
-- Insert the extracted ngrams once; keys are then normalised and indexed.
276 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
277 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
280 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
283 lId <- getOrMkList masterCorpusId masterUserId
284 mapCgramsId <- listInsertDb lId toNodeNgramsW'
285 $ map (first _ngramsTerms . second Map.keys)
286 $ HashMap.toList mapNgramsDocs
-- Triple comprehension flattens ngrams -> type -> (node, weight) into rows;
-- 'getCgramsId' may return Nothing, hence the 'catMaybes'.
288 _return <- insertNodeNodeNgrams2
289 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
290 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
291 <*> Just (fromIntegral w :: Double)
292 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
293 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
294 , (nId, w) <- Map.toList mapNodeIdWeight
297 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
299 _ <- insertDocNgrams lId indexedNgrams
302 ------------------------------------------------------------------------
303 -- TODO Type NodeDocumentUnicised
-- | Insert documents for a user into a corpus: tag each document with a
--   unique hash id, insert them, link them to the corpus, and return both
--   the new node ids and the documents paired with their ids
--   (via 'mergeData' on the returned hashes).
-- NOTE(review): signature lines 305-310 and line 316 (presumably the @let@
-- introducing 'newIds''/'documentsWithId') are missing from this view.
304 insertDocs :: ( FlowCmdM env err m
311 -> m ([DocId], [Indexed NodeId a])
312 insertDocs uId cId hs = do
313 let docs = map addUniqId hs
314 newIds <- insertDb uId cId docs
315 -- printDebug "newIds" newIds
317 newIds' = map reId newIds
-- Join DB return ids with the original docs by their unique hash.
318 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
319 _ <- Doc.add cId newIds'
320 pure (newIds', documentsWithId)
323 ------------------------------------------------------------------------
-- | Pair a document with its unique-id hash, panicking if the hash is
--   absent (documents are expected to have been through 'addUniqId' first).
-- NOTE(review): lines 325-326 (signature tail) and 328 (@where@) are
-- missing from this view; the error message "toInsert" also looks stale
-- for a function named viewUniqId' — worth confirming in the full source.
324 viewUniqId' :: UniqId a
327 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
329 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the successfully inserted rows by their unique-id hash, dropping
--   rows the DB reported as not inserted ('reInserted' is False).
-- NOTE(review): lines 333-334 (signature tail / equation head) are missing
-- from this view.  Style nit for the full source: @reInserted r == True@
-- is just @reInserted r@.
332 toInserted :: [ReturnId]
335 Map.fromList . map (\r -> (reUniqId r, r) )
336 . filter (\r -> reInserted r == True)
-- | Join documents (keyed by hash) with the DB return ids: each document
--   whose hash appears in the 'ReturnId' map becomes an @Indexed NodeId a@;
--   unmatched documents are silently dropped ('catMaybes').
-- NOTE(review): lines 339, 342 (@where@?) and 345 are missing from this
-- view, so the second argument type and the full 'toDocumentWithId' body
-- are unconfirmed.
338 mergeData :: Map Hash ReturnId
340 -> [Indexed NodeId a]
341 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
343 toDocumentWithId (sha,hpd) =
-- 'lookup' here is Data.Map.lookup (imported unqualified at the top).
344 Indexed <$> fmap reId (lookup sha rs)
347 ------------------------------------------------------------------------
348 ------------------------------------------------------------------------
349 ------------------------------------------------------------------------
-- | Run an ngrams-extraction action over each indexed document, pairing
--   every document with its extracted ngrams ('DocumentIdWithNgrams').
-- NOTE(review): lines 351 (first argument's type head), 356 (@where@?) and
-- 358 (the binding of @e@, presumably @e <- f (_unIndex d)@ or similar) are
-- missing from this view.
350 documentIdWithNgrams :: HasNodeError err
352 -> Cmd err (HashMap b (Map NgramsType Int)))
353 -> [Indexed NodeId a]
354 -> Cmd err [DocumentIdWithNgrams a b]
355 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
357 toDocumentIdWithNgrams d = do
359 pure $ DocumentIdWithNgrams d e
362 -- | TODO check optimization
-- | Aggregate per-document ngrams maps into one map keyed by ngrams:
--   each document's counts become singleton @NodeId -> Int@ maps, then all
--   documents are unioned with (+) at the innermost level.
-- NOTE(review): lines 365-368 (result type), 370 (@where@?) and 374 are
-- missing from this view.
363 mapNodeIdNgrams :: (Ord b, Hashable b)
364 => [DocumentIdWithNgrams a b]
369 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
371 f :: DocumentIdWithNgrams a b
372 -> HashMap b (Map NgramsType (Map NodeId Int))
-- Tag every count with this document's node id.
373 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
375 nId = _index $ documentWithId d
378 ------------------------------------------------------------------------
-- | Ngrams extraction for contacts: only the author (last name) is
--   extracted, tagged 'Authors' with weight 1; extracted ngrams are
--   truncated to 255 characters.
-- NOTE(review): lines 380 (@where@?), 382, 385 (the @extract@ equation
-- head binding @hc'@) and 389 are missing from this view.
379 instance ExtractNgramsT HyperdataContact
381 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
383 extract :: TermType Lang -> HyperdataContact
384 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-- Missing last name falls back to the literal "Nothing".
386 let authors = map text2ngrams
387 $ maybe ["Nothing"] (\a -> [a])
388 $ view (hc_who . _Just . cw_lastName) hc'
390 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
-- | Ngrams extraction for documents: source, institutes, authors and
--   extracted terms, each with weight 1 under its 'NgramsType'; keys are
--   truncated to 255 characters.
-- NOTE(review): several lines are missing from this view (394, 396, 399,
-- 401, 406-407, 410-411, 414-415, 417, 419), including the @where@ framing
-- and the field accessors each 'maybe' default applies to (presumably
-- _hd_source / _hd_institutes / _hd_authors) — confirm in the full source.
393 instance ExtractNgramsT HyperdataDocument
395 extractNgramsT :: TermType Lang
397 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
398 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
400 extractNgramsT' :: TermType Lang
402 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
403 extractNgramsT' lang' doc = do
-- Missing metadata falls back to the literal "Nothing" string(s).
404 let source = text2ngrams
405 $ maybe "Nothing" identity
-- Institutes are comma-separated and normalised to school names.
408 institutes = map text2ngrams
409 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
412 authors = map text2ngrams
413 $ maybe ["Nothing"] (splitOn ", ")
-- Terms come from the document text, enriched via CoreNLP POS tagging (NP).
416 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
418 <$> liftBase (extractTerms lang' $ hasText doc)
420 pure $ HashMap.fromList
421 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
422 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
423 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
424 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | A node's ngrams are its hyperdata's ngrams: unwrap the 8th field
--   (the hyperdata payload) and delegate.
-- NOTE(review): line 427 (likely @where@) is missing from this view.
426 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
428 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-- | A node's text is its hyperdata's text: unwrap the payload and delegate.
-- NOTE(review): line 431 (likely @where@) is missing from this view.
430 instance HasText a => HasText (Node a)
432 hasText (Node _ _ _ _ _ _ _ h) = hasText h