2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
37 , getOrMk_RootWithCorpus
46 import Control.Lens ((^.), view, _Just, makeLenses)
47 import Data.Aeson.TH (deriveJSON)
49 import Data.List (concat)
50 import qualified Data.Map as Map
51 import Data.Map (Map, lookup)
52 import Data.Maybe (Maybe(..), catMaybes)
55 import Data.Text (splitOn, intercalate)
56 import Data.Traversable (traverse)
57 import Data.Tuple.Extra (first, second)
58 import GHC.Generics (Generic)
59 import System.FilePath (FilePath)
61 import Gargantext.Core (Lang(..))
62 import Gargantext.Core.Flow.Types
63 import Gargantext.Core.Types (Terms(..))
64 import Gargantext.Core.Types.Individu (User(..))
65 import Gargantext.Core.Types.Main
66 import Gargantext.Database.Action.Flow.List
67 import Gargantext.Database.Action.Flow.Types
68 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
69 import Gargantext.Database.Query.Table.Node
70 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
71 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
72 import Gargantext.Database.Action.Search (searchDocInDatabase)
73 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
74 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
75 import Gargantext.Database.Admin.Types.Hyperdata
76 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
77 import Gargantext.Database.Prelude
78 import Gargantext.Database.Query.Table.Ngrams
79 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
80 import Gargantext.Database.Query.Table.NodeNodeNgrams2
81 import Gargantext.Database.Schema.Node (NodePoly(..))
82 import Gargantext.Core.Ext.IMT (toSchoolName)
83 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
84 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
85 import Gargantext.Core.Text
86 import Gargantext.Prelude
87 import Gargantext.Prelude.Crypto.Hash (Hash)
88 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
89 import Gargantext.Core.Text.List (buildNgramsLists,StopSize(..))
90 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
91 import Gargantext.Core.Text.Terms
92 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
93 import qualified Gargantext.Core.Text.Corpus.API as API
95 ------------------------------------------------------------------------
96 -- TODO use internal with API name (could be old data)
97 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
98 | ExternalOrigin { _do_api :: API.ExternalAPIs }
100 deriving (Generic, Eq)
102 makeLenses ''DataOrigin
103 deriveJSON (unPrefix "_do_") ''DataOrigin
104 instance ToSchema DataOrigin where
105 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
107 allDataOrigins :: [DataOrigin]
108 allDataOrigins = map InternalOrigin API.externalAPIs
109 <> map ExternalOrigin API.externalAPIs
112 data DataText = DataOld ![NodeId]
113 | DataNew ![[HyperdataDocument]]
115 -- TODO use the split parameter in config file
116 getDataText :: FlowCmdM env err m
122 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
124 <$> API.get api (_tt_lang la) q li
125 getDataText (InternalOrigin _) _la q _li = do
126 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
127 (UserName userMaster)
129 (Nothing :: Maybe HyperdataCorpus)
130 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
133 -------------------------------------------------------------------------------
134 flowDataText :: FlowCmdM env err m
140 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
142 corpusType = (Nothing :: Maybe HyperdataCorpus)
143 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
145 ------------------------------------------------------------------------
147 flowAnnuaire :: FlowCmdM env err m
149 -> Either CorpusName [CorpusId]
153 flowAnnuaire u n l filePath = do
154 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
155 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
157 ------------------------------------------------------------------------
158 flowCorpusFile :: FlowCmdM env err m
160 -> Either CorpusName [CorpusId]
161 -> Limit -- Limit the number of docs (for dev purpose)
162 -> TermType Lang -> FileFormat -> FilePath
164 flowCorpusFile u n l la ff fp = do
165 docs <- liftBase ( splitEvery 500
169 flowCorpus u n la (map (map toHyperdataDocument) docs)
171 ------------------------------------------------------------------------
172 -- | TODO improve the needed type to create/update a corpus
173 -- (For now, Either is enough)
174 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 -> Either CorpusName [CorpusId]
180 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
183 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
186 -> Either CorpusName [CorpusId]
190 flow c u cn la docs = do
191 -- TODO if public insertMasterDocs else insertUserDocs
192 ids <- traverse (insertMasterDocs c la) docs
193 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
195 ------------------------------------------------------------------------
196 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
199 -> Either CorpusName [CorpusId]
203 flowCorpusUser l user corpusName ctype ids = do
205 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
206 listId <- getOrMkList userCorpusId userId
207 _cooc <- insertDefaultNode NodeListCooc listId userId
208 -- TODO: check if present already, ignore
209 _ <- Doc.add userCorpusId ids
211 _tId <- insertDefaultNode NodeTexts userCorpusId userId
212 -- printDebug "Node Text Ids:" tId
215 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
216 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
217 _userListId <- flowList_DbRepo listId ngs
218 _mastListId <- getOrMkList masterCorpusId masterUserId
219 -- _ <- insertOccsUpdates userCorpusId mastListId
220 -- printDebug "userListId" userListId
222 _ <- insertDefaultNode NodeDashboard userCorpusId userId
223 _ <- insertDefaultNode NodeGraph userCorpusId userId
224 --_ <- mkPhylo userCorpusId userId
226 -- _ <- mkAnnuaire rootUserId userId
229 -- TODO Type NodeDocumentUnicised
230 insertDocs :: ( FlowCmdM env err m
237 -> m ([DocId], [DocumentWithId a])
238 insertDocs uId cId hs = do
239 let docs = map addUniqId hs
240 newIds <- insertDb uId cId docs
241 -- printDebug "newIds" newIds
243 newIds' = map reId newIds
244 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
245 _ <- Doc.add cId newIds'
246 pure (newIds', documentsWithId)
249 insertMasterDocs :: ( FlowCmdM env err m
257 insertMasterDocs c lang hs = do
258 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
259 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
260 _ <- Doc.add masterCorpusId ids'
262 -- create a corpus with database name (CSV or PubMed)
263 -- add documents to the corpus (create node_node link)
264 -- this will enable global database monitoring
266 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
267 mapNgramsDocs <- mapNodeIdNgrams
268 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
270 terms2id <- insertNgrams $ Map.keys mapNgramsDocs
272 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
275 lId <- getOrMkList masterCorpusId masterUserId
276 mapCgramsId <- listInsertDb lId toNodeNgramsW'
277 $ map (first _ngramsTerms . second Map.keys)
278 $ Map.toList mapNgramsDocs
280 _return <- insertNodeNodeNgrams2
281 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
282 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
283 <*> Just (fromIntegral w :: Double)
284 | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
285 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
286 , (nId, w) <- Map.toList mapNodeIdWeight
289 _cooc <- insertDefaultNode NodeListCooc lId masterUserId
291 _ <- insertDocNgrams lId indexedNgrams
294 ------------------------------------------------------------------------
295 ------------------------------------------------------------------------
296 viewUniqId' :: UniqId a
299 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
301 err = panic "[ERROR] Database.Flow.toInsert"
304 toInserted :: [ReturnId]
307 Map.fromList . map (\r -> (reUniqId r, r) )
308 . filter (\r -> reInserted r == True)
310 mergeData :: Map Hash ReturnId
312 -> [DocumentWithId a]
313 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
315 toDocumentWithId (sha,hpd) =
316 DocumentWithId <$> fmap reId (lookup sha rs)
319 ------------------------------------------------------------------------
320 instance HasText HyperdataContact
323 ------------------------------------------------------------------------
324 ------------------------------------------------------------------------
325 documentIdWithNgrams :: HasNodeError err
327 -> Cmd err (Map Ngrams (Map NgramsType Int)))
328 -> [DocumentWithId a]
329 -> Cmd err [DocumentIdWithNgrams a]
330 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
332 toDocumentIdWithNgrams d = do
333 e <- f $ documentData d
334 pure $ DocumentIdWithNgrams d e
336 ------------------------------------------------------------------------
337 instance ExtractNgramsT HyperdataContact
339 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
341 extract :: TermType Lang -> HyperdataContact
342 -> Cmd err (Map Ngrams (Map NgramsType Int))
344 let authors = map text2ngrams
345 $ maybe ["Nothing"] (\a -> [a])
346 $ view (hc_who . _Just . cw_lastName) hc'
348 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
350 instance HasText HyperdataDocument
352 hasText h = catMaybes [ _hd_title h
357 instance ExtractNgramsT HyperdataDocument
359 extractNgramsT :: TermType Lang
361 -> Cmd err (Map Ngrams (Map NgramsType Int))
362 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
364 extractNgramsT' :: TermType Lang
366 -> Cmd err (Map Ngrams (Map NgramsType Int))
367 extractNgramsT' lang' doc = do
368 let source = text2ngrams
369 $ maybe "Nothing" identity
372 institutes = map text2ngrams
373 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
376 authors = map text2ngrams
377 $ maybe ["Nothing"] (splitOn ", ")
380 terms' <- map text2ngrams
381 <$> map (intercalate " " . _terms_label)
383 <$> liftBase (extractTerms lang' $ hasText doc)
385 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
386 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
387 <> [(a', Map.singleton Authors 1) | a' <- authors ]
388 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
390 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
392 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
394 instance HasText a => HasText (Node a)
396 hasText (Node _ _ _ _ _ _ _ h) = hasText h