2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
50 import Data.List (concat)
51 import qualified Data.Map as Map
52 import Data.Map (Map, lookup)
53 import Data.Maybe (catMaybes)
56 import Data.Text (splitOn, intercalate)
57 import Data.Traversable (traverse)
58 import Data.Tuple.Extra (first, second)
59 import GHC.Generics (Generic)
60 import System.FilePath (FilePath)
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Ext.IMT (toSchoolName)
64 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
65 import Gargantext.Core.Flow.Types
66 import Gargantext.Core.Text
67 import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..))
68 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
69 import Gargantext.Core.Text.List (buildNgramsLists)
70 import Gargantext.Core.Text.Terms
71 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
72 import Gargantext.Core.Types (Terms(..))
73 import Gargantext.Core.Types.Individu (User(..))
74 import Gargantext.Core.Types.Main
75 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
76 import Gargantext.Database.Action.Flow.List
77 import Gargantext.Database.Action.Flow.Types
78 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
79 import Gargantext.Database.Action.Search (searchDocInDatabase)
80 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
81 import Gargantext.Database.Admin.Types.Hyperdata
82 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
83 import Gargantext.Database.Prelude
84 import Gargantext.Database.Query.Table.Ngrams
85 import Gargantext.Database.Query.Table.Node
86 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
87 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
88 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
89 import Gargantext.Database.Query.Table.NodeNodeNgrams2
90 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
91 import Gargantext.Database.Schema.Node (NodePoly(..))
92 import Gargantext.Prelude
93 import Gargantext.Prelude.Crypto.Hash (Hash)
94 import qualified Gargantext.Core.Text.Corpus.API as API
95 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
97 ------------------------------------------------------------------------
98 -- TODO use internal with API name (could be old data)
99 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
100 | ExternalOrigin { _do_api :: API.ExternalAPIs }
102 deriving (Generic, Eq)
104 makeLenses ''DataOrigin
105 deriveJSON (unPrefix "_do_") ''DataOrigin
106 instance ToSchema DataOrigin where
107 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
109 allDataOrigins :: [DataOrigin]
110 allDataOrigins = map InternalOrigin API.externalAPIs
111 <> map ExternalOrigin API.externalAPIs
114 data DataText = DataOld ![NodeId]
115 | DataNew ![[HyperdataDocument]]
117 -- TODO use the split parameter in config file
118 getDataText :: FlowCmdM env err m
124 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
126 <$> API.get api (_tt_lang la) q li
128 getDataText (InternalOrigin _) _la q _li = do
129 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
130 (UserName userMaster)
132 (Nothing :: Maybe HyperdataCorpus)
133 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
136 -------------------------------------------------------------------------------
137 flowDataText :: ( FlowCmdM env err m
144 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
146 corpusType = (Nothing :: Maybe HyperdataCorpus)
147 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
149 ------------------------------------------------------------------------
151 flowAnnuaire :: (FlowCmdM env err m)
153 -> Either CorpusName [CorpusId]
157 flowAnnuaire u n l filePath = do
158 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
159 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
161 ------------------------------------------------------------------------
162 flowCorpusFile :: (FlowCmdM env err m)
164 -> Either CorpusName [CorpusId]
165 -> Limit -- Limit the number of docs (for dev purpose)
166 -> TermType Lang -> FileFormat -> FilePath
168 flowCorpusFile u n l la ff fp = do
169 docs <- liftBase ( splitEvery 500
173 flowCorpus u n la (map (map toHyperdataDocument) docs)
175 ------------------------------------------------------------------------
176 -- | TODO improve the needed type to create/update a corpus
177 -- (For now, Either is enough)
178 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
180 -> Either CorpusName [CorpusId]
184 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
187 flow :: ( FlowCmdM env err m
193 -> Either CorpusName [CorpusId]
197 flow c u cn la docs = do
198 -- TODO if public insertMasterDocs else insertUserDocs
199 ids <- traverse (insertMasterDocs c la) docs
200 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
202 ------------------------------------------------------------------------
203 flowCorpusUser :: ( FlowCmdM env err m
208 -> Either CorpusName [CorpusId]
212 flowCorpusUser l user corpusName ctype ids = do
214 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
215 -- NodeTexts is first
216 _tId <- insertDefaultNode NodeTexts userCorpusId userId
217 -- printDebug "NodeTexts: " tId
219 -- NodeList is second
220 listId <- getOrMkList userCorpusId userId
221 -- _cooc <- insertDefaultNode NodeListCooc listId userId
222 -- TODO: check if present already, ignore
223 _ <- Doc.add userCorpusId ids
225 -- printDebug "Node Text Ids:" tId
228 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
229 ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
230 _userListId <- flowList_DbRepo listId ngs
231 _mastListId <- getOrMkList masterCorpusId masterUserId
232 -- _ <- insertOccsUpdates userCorpusId mastListId
233 -- printDebug "userListId" userListId
235 _ <- insertDefaultNode NodeDashboard userCorpusId userId
236 _ <- insertDefaultNode NodeGraph userCorpusId userId
237 --_ <- mkPhylo userCorpusId userId
239 -- _ <- mkAnnuaire rootUserId userId
243 insertMasterDocs :: ( FlowCmdM env err m
251 insertMasterDocs c lang hs = do
252 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
253 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
254 _ <- Doc.add masterCorpusId ids'
256 -- create a corpus with database name (CSV or PubMed)
257 -- add documents to the corpus (create node_node link)
258 -- this will enable global database monitoring
260 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
261 mapNgramsDocs <- mapNodeIdNgrams
262 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
264 terms2id <- insertNgrams $ Map.keys mapNgramsDocs
266 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
269 lId <- getOrMkList masterCorpusId masterUserId
270 mapCgramsId <- listInsertDb lId toNodeNgramsW'
271 $ map (first _ngramsTerms . second Map.keys)
272 $ Map.toList mapNgramsDocs
274 _return <- insertNodeNodeNgrams2
275 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
276 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
277 <*> Just (fromIntegral w :: Double)
278 | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
279 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
280 , (nId, w) <- Map.toList mapNodeIdWeight
283 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
285 _ <- insertDocNgrams lId indexedNgrams
288 ------------------------------------------------------------------------
289 -- TODO Type NodeDocumentUnicised
290 insertDocs :: ( FlowCmdM env err m
297 -> m ([DocId], [DocumentWithId a])
298 insertDocs uId cId hs = do
299 let docs = map addUniqId hs
300 newIds <- insertDb uId cId docs
301 -- printDebug "newIds" newIds
303 newIds' = map reId newIds
304 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
305 _ <- Doc.add cId newIds'
306 pure (newIds', documentsWithId)
310 ------------------------------------------------------------------------
311 viewUniqId' :: UniqId a
314 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
316 err = panic "[ERROR] Database.Flow.toInsert"
319 toInserted :: [ReturnId]
322 Map.fromList . map (\r -> (reUniqId r, r) )
323 . filter (\r -> reInserted r == True)
325 mergeData :: Map Hash ReturnId
327 -> [DocumentWithId a]
328 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
330 toDocumentWithId (sha,hpd) =
331 DocumentWithId <$> fmap reId (lookup sha rs)
334 ------------------------------------------------------------------------
335 instance HasText HyperdataContact
338 ------------------------------------------------------------------------
339 ------------------------------------------------------------------------
340 documentIdWithNgrams :: HasNodeError err
342 -> Cmd err (Map Ngrams (Map NgramsType Int)))
343 -> [DocumentWithId a]
344 -> Cmd err [DocumentIdWithNgrams a]
345 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
347 toDocumentIdWithNgrams d = do
348 e <- f $ documentData d
349 pure $ DocumentIdWithNgrams d e
351 ------------------------------------------------------------------------
352 instance ExtractNgramsT HyperdataContact
354 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
356 extract :: TermType Lang -> HyperdataContact
357 -> Cmd err (Map Ngrams (Map NgramsType Int))
359 let authors = map text2ngrams
360 $ maybe ["Nothing"] (\a -> [a])
361 $ view (hc_who . _Just . cw_lastName) hc'
363 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
365 instance HasText HyperdataDocument
367 hasText h = catMaybes [ _hd_title h
372 instance ExtractNgramsT HyperdataDocument
374 extractNgramsT :: TermType Lang
376 -> Cmd err (Map Ngrams (Map NgramsType Int))
377 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
379 extractNgramsT' :: TermType Lang
381 -> Cmd err (Map Ngrams (Map NgramsType Int))
382 extractNgramsT' lang' doc = do
383 let source = text2ngrams
384 $ maybe "Nothing" identity
387 institutes = map text2ngrams
388 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
391 authors = map text2ngrams
392 $ maybe ["Nothing"] (splitOn ", ")
395 terms' <- map text2ngrams
396 <$> map (intercalate " " . _terms_label)
398 <$> liftBase (extractTerms lang' $ hasText doc)
400 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
401 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
402 <> [(a', Map.singleton Authors 1) | a' <- authors ]
403 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
405 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
407 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
409 instance HasText a => HasText (Node a)
411 hasText (Node _ _ _ _ _ _ _ h) = hasText h