2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
50 import Data.HashMap.Strict (HashMap)
51 import Data.Hashable (Hashable)
52 import Data.List (concat)
53 import Data.Map (Map, lookup)
54 import Data.Maybe (catMaybes)
57 import Data.Text (splitOn)
58 import Data.Traversable (traverse)
59 import Data.Tuple.Extra (first, second)
60 import GHC.Generics (Generic)
61 import System.FilePath (FilePath)
62 import qualified Data.HashMap.Strict as HashMap
63 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
64 import qualified Data.Map as Map
66 import Gargantext.Core (Lang(..), PosTagAlgo(..))
67 import Gargantext.Core.Ext.IMT (toSchoolName)
68 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
69 import Gargantext.Core.Flow.Types
70 import Gargantext.Core.Text
71 import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..))
72 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
73 import Gargantext.Core.Text.List (buildNgramsLists)
74 import Gargantext.Core.Text.Terms
75 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
76 import Gargantext.Core.Types (POS(NP))
77 import Gargantext.Core.Types.Individu (User(..))
78 import Gargantext.Core.Types.Main
79 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
80 import Gargantext.Database.Action.Flow.List
81 import Gargantext.Database.Action.Flow.Types
82 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
83 import Gargantext.Database.Action.Search (searchDocInDatabase)
84 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
85 import Gargantext.Database.Admin.Types.Hyperdata
86 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
87 import Gargantext.Database.Prelude
88 import Gargantext.Database.Query.Table.Ngrams
89 import Gargantext.Database.Query.Table.Node
90 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
91 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
92 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
93 import Gargantext.Database.Query.Table.NodeNodeNgrams2
94 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
95 import Gargantext.Database.Schema.Node (NodePoly(..))
96 import Gargantext.Database.Types
97 import Gargantext.Prelude
98 import Gargantext.Prelude.Crypto.Hash (Hash)
99 import qualified Gargantext.Core.Text.Corpus.API as API
100 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
102 ------------------------------------------------------------------------
103 -- TODO use internal with API name (could be old data)
-- | Where textual data comes from: data already harvested into Gargantext's
--   own database ('InternalOrigin') or data to be fetched from an external
--   API ('ExternalOrigin').  Both constructors record the API concerned.
104 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
105 | ExternalOrigin { _do_api :: API.ExternalAPIs }
107 deriving (Generic, Eq)
-- Lenses plus JSON (de)serialisation with the "_do_" field prefix stripped.
109 makeLenses ''DataOrigin
110 deriveJSON (unPrefix "_do_") ''DataOrigin
-- Swagger schema follows the same prefix-stripping convention as the JSON.
111 instance ToSchema DataOrigin where
112 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every known data origin: each external API is usable both as an
--   internal origin and as an external origin, internal ones first.
allDataOrigins :: [DataOrigin]
allDataOrigins =
  concatMap (\mkOrigin -> map mkOrigin API.externalAPIs)
            [InternalOrigin, ExternalOrigin]
-- | Text data flowing into a corpus: 'DataOld' references documents already
--   present in the database by node id; 'DataNew' carries freshly fetched
--   documents, batched as lists of lists.  Fields are strict.
119 data DataText = DataOld ![NodeId]
120 | DataNew ![[HyperdataDocument]]
122 -- TODO use the split parameter in config file
-- | Retrieve 'DataText' for a query, dispatching on the 'DataOrigin'.
--   External origins delegate to the remote API client; internal origins
--   search the master user's corpus already stored in the database.
123 getDataText :: FlowCmdM env err m
-- External: fetch new documents from the API, in the query language,
-- bounded by the given limit.
129 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
131 <$> API.get api (_tt_lang la) q li
-- Internal: locate (or create) the master root/corpus, then search it.
-- The query is stemmed before searching (English stemmer — see 'stemIt').
133 getDataText (InternalOrigin _) _la q _li = do
134 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
135 (UserName userMaster)
137 (Nothing :: Maybe HyperdataCorpus)
138 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
141 -------------------------------------------------------------------------------
-- | Flow 'DataText' into the given corpus.  Already-stored documents
--   ('DataOld') are attached via 'flowCorpusUser'; new documents ('DataNew')
--   go through the full 'flowCorpus' insertion pipeline.
142 flowDataText :: ( FlowCmdM env err m
149 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
-- No hyperdata override: the corpus keeps its default HyperdataCorpus type.
151 corpusType = (Nothing :: Maybe HyperdataCorpus)
152 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
154 ------------------------------------------------------------------------
-- | Flow an annuaire (contact directory): deserialise IMT users from the
--   given file and insert them through the generic 'flow' pipeline.
156 flowAnnuaire :: (FlowCmdM env err m)
158 -> Either CorpusName [CorpusId]
162 flowAnnuaire u n l filePath = do
-- Batches of 500 contacts keep each insert transaction bounded.
163 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
164 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
166 ------------------------------------------------------------------------
-- | Flow a corpus from a local file: parse it in the given 'FileFormat',
--   convert each parsed record to a 'HyperdataDocument', and insert via
--   'flowCorpus'.
167 flowCorpusFile :: (FlowCmdM env err m)
169 -> Either CorpusName [CorpusId]
170 -> Limit -- Limit the number of docs (for dev purpose)
171 -> TermType Lang -> FileFormat -> FilePath
173 flowCorpusFile u n l la ff fp = do
-- Parse then batch in groups of 500 (parsing step elided in this view;
-- presumably 'parseFile ff fp' — confirm against full source).
174 docs <- liftBase ( splitEvery 500
178 flowCorpus u n la (map (map toHyperdataDocument) docs)
180 ------------------------------------------------------------------------
181 -- | TODO improve the needed type to create/update a corpus
182 -- (For now, Either is enough)
-- 'flow' specialised to corpora: no hyperdata override is supplied, so the
-- corpus keeps the default 'HyperdataCorpus'.
183 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
185 -> Either CorpusName [CorpusId]
189 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic entry point: insert every batch of documents into the master
--   corpus, then attach the resulting document ids to the user's corpus.
192 flow :: ( FlowCmdM env err m
198 -> Either CorpusName [CorpusId]
202 flow c u cn la docs = do
203 -- TODO if public insertMasterDocs else insertUserDocs
-- One 'insertMasterDocs' call per batch; ids are concatenated afterwards.
204 ids <- traverse (insertMasterDocs c la) docs
205 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
207 ------------------------------------------------------------------------
-- | Wire already-inserted documents into the user's corpus: create the
--   user's root/corpus if needed, add the standard child nodes (texts,
--   list, dashboard, graph), attach the documents, and build the ngrams
--   lists from the user and master corpora.
208 flowCorpusUser :: ( FlowCmdM env err m
213 -> Either CorpusName [CorpusId]
217 flowCorpusUser l user corpusName ctype ids = do
-- User's root and corpus (created on first use).
219 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
220 -- NodeTexts is first
221 _tId <- insertDefaultNode NodeTexts userCorpusId userId
222 -- printDebug "NodeTexts: " tId
224 -- NodeList is second
225 listId <- getOrMkList userCorpusId userId
226 -- _cooc <- insertDefaultNode NodeListCooc listId userId
227 -- TODO: check if present already, ignore
-- Link the documents to the user corpus.
228 _ <- Doc.add userCorpusId ids
230 -- printDebug "Node Text Ids:" tId
-- Master corpus (shared, owned by the master user) — needed to build lists.
233 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
-- GroupParams l 2 3 (StopSize 3): grouping/stop-word thresholds; the magic
-- numbers' exact semantics live in Gargantext.Core.Text.List.Group.WithStem.
234 ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
235 _userListId <- flowList_DbRepo listId ngs
236 _mastListId <- getOrMkList masterCorpusId masterUserId
237 -- _ <- insertOccsUpdates userCorpusId mastListId
238 -- printDebug "userListId" userListId
-- Default dashboard and graph nodes for the corpus.
240 _ <- insertDefaultNode NodeDashboard userCorpusId userId
241 _ <- insertDefaultNode NodeGraph userCorpusId userId
242 --_ <- mkPhylo userCorpusId userId
244 -- _ <- mkAnnuaire rootUserId userId
-- | Insert documents into the master corpus and index their ngrams:
--   (1) insert/deduplicate the documents, (2) extract ngrams per document,
--   (3) persist the ngrams and their per-document occurrence weights.
248 insertMasterDocs :: ( FlowCmdM env err m
256 insertMasterDocs c lang hs = do
257 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
258 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
-- Link inserted docs to the master corpus (node_node rows).
259 _ <- Doc.add masterCorpusId ids'
261 -- create a corpus with database name (CSV or PubMed)
262 -- add documents to the corpus (create node_node link)
263 -- this will enable global database monitoring
265 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
-- Extracted ngrams, keyed by ngram, then by type, then by document node id.
266 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
268 <$> documentIdWithNgrams
269 (extractNgramsT $ withLang lang documentsWithId)
-- Persist the raw ngrams; 'terms2id' maps each term to its database id.
272 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
273 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- Re-key by the database ids obtained above.
276 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
-- Master list: register the (term, ngrams-type) pairs as node_ngrams rows.
279 lId <- getOrMkList masterCorpusId masterUserId
280 mapCgramsId <- listInsertDb lId toNodeNgramsW'
281 $ map (first _ngramsTerms . second Map.keys)
282 $ HashMap.toList mapNgramsDocs
-- One NodeNodeNgrams2 row per (document, ngrams-type, term) with its weight;
-- entries whose cgrams id cannot be resolved are dropped by 'catMaybes'.
284 _return <- insertNodeNodeNgrams2
285 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
286 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
287 <*> Just (fromIntegral w :: Double)
288 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
289 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
290 , (nId, w) <- Map.toList mapNodeIdWeight
293 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- Per-document ngram occurrence counts, keyed by the indexed ngrams.
295 _ <- insertDocNgrams lId indexedNgrams
298 ------------------------------------------------------------------------
299 -- TODO Type NodeDocumentUnicised
-- | Insert documents under a corpus, deduplicating via content hashes:
--   returns the new node ids plus each document paired with its node id.
300 insertDocs :: ( FlowCmdM env err m
307 -> m ([DocId], [Indexed NodeId a])
308 insertDocs uId cId hs = do
-- Tag every document with its unique content hash before inserting.
309 let docs = map addUniqId hs
310 newIds <- insertDb uId cId docs
311 -- printDebug "newIds" newIds
313 newIds' = map reId newIds
-- Join the database's returned ids back to the original documents by hash.
314 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
315 _ <- Doc.add cId newIds'
316 pure (newIds', documentsWithId)
319 ------------------------------------------------------------------------
-- | Pair a document with its unique content hash; panics if the hash is
--   missing (documents must have been through 'addUniqId' first).
320 viewUniqId' :: UniqId a
323 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
325 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the rows the database actually inserted by their unique hash,
--   dropping rows that were already present ('reInserted' is False).
toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r) )
               . filter reInserted    -- was: (\r -> reInserted r == True)
-- | Join inserted rows (keyed by hash) back to the original documents
--   (also keyed by hash), producing each document tagged with its node id.
--   Documents whose hash is absent from the inserted map are dropped.
334 mergeData :: Map Hash ReturnId
336 -> [Indexed NodeId a]
-- NOTE(review): catMaybes . map could be Data.Maybe.mapMaybe — left as-is
-- since surrounding lines are elided in this view.
337 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
339 toDocumentWithId (sha,hpd) =
340 Indexed <$> fmap reId (lookup sha rs)
343 ------------------------------------------------------------------------
344 ------------------------------------------------------------------------
345 ------------------------------------------------------------------------
-- | Run an ngrams-extraction action over every indexed document, pairing
--   each document with its extracted ngrams.
346 documentIdWithNgrams :: HasNodeError err
348 -> Cmd err (HashMap b (Map NgramsType Int)))
349 -> [Indexed NodeId a]
350 -> Cmd err [DocumentIdWithNgrams a b]
351 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
-- Presumably applies 'f' to the document's payload; the binding of 'e' is
-- elided in this view — confirm against full source.
353 toDocumentIdWithNgrams d = do
355 pure $ DocumentIdWithNgrams d e
358 -- | TODO check optimization
-- Aggregate per-document ngrams into one map: for each ngram and ngrams
-- type, the per-document occurrence counts, summed when the same ngram
-- appears for the same document more than once.
359 mapNodeIdNgrams :: (Ord b, Hashable b)
360 => [DocumentIdWithNgrams a b]
365 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
-- Lift one document's ngram counts into singleton maps keyed by its node id,
-- so the unions above can merge across documents.
367 f :: DocumentIdWithNgrams a b
368 -> HashMap b (Map NgramsType (Map NodeId Int))
369 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
371 nId = _index $ documentWithId d
374 ------------------------------------------------------------------------
-- | Contacts contribute only author ngrams: the last name (or the literal
--   "Nothing" when absent), capped at 255 characters.
375 instance ExtractNgramsT HyperdataContact
377 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
379 extract :: TermType Lang -> HyperdataContact
380 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-- Last name of the contact, defaulting to the string "Nothing".
382 let authors = map text2ngrams
383 $ maybe ["Nothing"] (\a -> [a])
384 $ view (hc_who . _Just . cw_lastName) hc'
386 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
-- | Documents contribute four kinds of ngrams: source, institutes, authors
--   (simple ngrams) and extracted terms (enriched ngrams), each capped at
--   255 characters and weighted 1 per occurrence list entry.
389 instance ExtractNgramsT HyperdataDocument
391 extractNgramsT :: TermType Lang
393 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
394 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
396 extractNgramsT' :: TermType Lang
398 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
399 extractNgramsT' lang' doc = do
-- Each metadata field defaults to the string "Nothing" when absent.
400 let source = text2ngrams
401 $ maybe "Nothing" identity
-- Institutes: comma-separated, each mapped to its school name.
404 institutes = map text2ngrams
405 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
-- Authors: comma-separated list.
408 authors = map text2ngrams
409 $ maybe ["Nothing"] (splitOn ", ")
-- Terms extracted from the document text, POS-tagged (CoreNLP, NP).
412 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
414 <$> liftBase (extractTerms lang' $ hasText doc)
416 pure $ HashMap.fromList
417 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
418 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
419 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
420 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | A node delegates ngram extraction to its hyperdata payload (the last
--   field of 'Node').
422 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
424 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-- | A node's text is its hyperdata payload's text.
426 instance HasText a => HasText (Node a)
428 hasText (Node _ _ _ _ _ _ _ h) = hasText h