2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
36 , getOrMk_RootWithCorpus
45 import Control.Lens ((^.), view, _Just, makeLenses)
46 import Data.Aeson.TH (deriveJSON)
48 import Data.List (concat)
49 import qualified Data.Map as Map
50 import Data.Map (Map, lookup)
51 import Data.Maybe (Maybe(..), catMaybes)
54 import Data.Text (splitOn, intercalate)
55 import Data.Traversable (traverse)
56 import Data.Tuple.Extra (first, second)
57 import GHC.Generics (Generic)
58 import System.FilePath (FilePath)
60 import Gargantext.Core (Lang(..))
61 import Gargantext.Core.Flow.Types
62 import Gargantext.Core.Types (Terms(..))
63 import Gargantext.Core.Types.Individu (User(..))
64 import Gargantext.Core.Types.Main
65 import Gargantext.Database.Action.Flow.List
66 import Gargantext.Database.Action.Flow.Types
67 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
68 import Gargantext.Database.Query.Table.Node
69 import Gargantext.Database.Query.Table.Node.Contact -- (HyperdataContact(..), ContactWho(..))
70 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
71 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
72 import Gargantext.Database.Action.Search (searchInDatabase)
73 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
74 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
75 import Gargantext.Database.Admin.Types.Hyperdata
76 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
77 import Gargantext.Database.Prelude
78 import Gargantext.Database.Query.Table.Ngrams
79 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
80 import Gargantext.Database.Query.Table.NodeNodeNgrams2
81 import Gargantext.Ext.IMT (toSchoolName)
82 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
83 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
84 import Gargantext.Text
85 import Gargantext.Prelude
86 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
87 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
88 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
89 import Gargantext.Text.Terms
90 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
91 import qualified Gargantext.Text.Corpus.API as API
93 ------------------------------------------------------------------------
94 -- TODO use internal with API name (could be old data)
95 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
96 | ExternalOrigin { _do_api :: API.ExternalAPIs }
98 deriving (Generic, Eq)
100 makeLenses ''DataOrigin
101 deriveJSON (unPrefix "_do_") ''DataOrigin
102 instance ToSchema DataOrigin where
103 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
105 allDataOrigins :: [DataOrigin]
106 allDataOrigins = map InternalOrigin API.externalAPIs
107 <> map ExternalOrigin API.externalAPIs
111 data DataText = DataOld ![NodeId]
112 | DataNew ![[HyperdataDocument]]
115 -- TODO use the split parameter in config file
116 getDataText :: FlowCmdM env err m
122 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
124 <$> API.get api (_tt_lang la) q li
125 getDataText (InternalOrigin _) _la q _li = do
126 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
127 (UserName userMaster)
129 (Nothing :: Maybe HyperdataCorpus)
130 ids <- map fst <$> searchInDatabase cId (stemIt q)
133 -------------------------------------------------------------------------------
134 flowDataText :: FlowCmdM env err m
140 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
142 corpusType = (Nothing :: Maybe HyperdataCorpus)
143 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
145 ------------------------------------------------------------------------
147 flowAnnuaire :: FlowCmdM env err m
149 -> Either CorpusName [CorpusId]
153 flowAnnuaire u n l filePath = do
154 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
155 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
157 ------------------------------------------------------------------------
158 flowCorpusFile :: FlowCmdM env err m
160 -> Either CorpusName [CorpusId]
161 -> Limit -- Limit the number of docs (for dev purpose)
162 -> TermType Lang -> FileFormat -> FilePath
164 flowCorpusFile u n l la ff fp = do
165 docs <- liftBase ( splitEvery 500
169 flowCorpus u n la (map (map toHyperdataDocument) docs)
171 ------------------------------------------------------------------------
172 -- | TODO improve the needed type to create/update a corpus
173 -- (For now, Either is enough)
174 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 -> Either CorpusName [CorpusId]
180 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
183 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
186 -> Either CorpusName [CorpusId]
190 flow c u cn la docs = do
191 ids <- traverse (insertMasterDocs c la) docs
192 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
194 ------------------------------------------------------------------------
195 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
198 -> Either CorpusName [CorpusId]
202 flowCorpusUser l user corpusName ctype ids = do
204 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
205 listId <- getOrMkList userCorpusId userId
206 _cooc <- mkNode NodeListCooc listId userId
207 -- TODO: check if present already, ignore
208 _ <- Doc.add userCorpusId ids
210 _tId <- mkNode NodeTexts userCorpusId userId
211 -- printDebug "Node Text Id" tId
214 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
215 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
216 _userListId <- flowList_DbRepo listId ngs
217 _mastListId <- getOrMkList masterCorpusId masterUserId
218 -- _ <- insertOccsUpdates userCorpusId mastListId
219 -- printDebug "userListId" userListId
221 _ <- mkDashboard userCorpusId userId
222 _ <- mkGraph userCorpusId userId
223 --_ <- mkPhylo userCorpusId userId
226 -- _ <- mkAnnuaire rootUserId userId
230 insertMasterDocs :: ( FlowCmdM env err m
238 insertMasterDocs c lang hs = do
239 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
241 -- TODO Type NodeDocumentUnicised
242 let docs = map addUniqId hs
243 ids <- insertDb masterUserId masterCorpusId docs
246 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
248 -- create a corpus with database name (CSV or PubMed)
249 -- add documents to the corpus (create node_node link)
250 -- this will enable global database monitoring
252 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
253 maps <- mapNodeIdNgrams
254 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
256 terms2id <- insertNgrams $ Map.keys maps
258 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
261 lId <- getOrMkList masterCorpusId masterUserId
262 mapCgramsId <- listInsertDb lId toNodeNgramsW'
263 $ map (first _ngramsTerms . second Map.keys)
266 _return <- insertNodeNodeNgrams2
267 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
268 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
269 <*> Just (fromIntegral w :: Double)
270 | (terms'', mapNgramsTypes) <- Map.toList maps
271 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
272 , (nId, w) <- Map.toList mapNodeIdWeight
275 _ <- Doc.add masterCorpusId ids'
276 _cooc <- mkNode NodeListCooc lId masterUserId
278 _ <- insertDocNgrams lId indexedNgrams
283 ------------------------------------------------------------------------
287 ------------------------------------------------------------------------
288 viewUniqId' :: UniqId a
291 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
293 err = panic "[ERROR] Database.Flow.toInsert"
296 toInserted :: [ReturnId]
297 -> Map HashId ReturnId
299 Map.fromList . map (\r -> (reUniqId r, r) )
300 . filter (\r -> reInserted r == True)
302 mergeData :: Map HashId ReturnId
304 -> [DocumentWithId a]
305 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
307 toDocumentWithId (sha,hpd) =
308 DocumentWithId <$> fmap reId (lookup sha rs)
311 ------------------------------------------------------------------------
313 instance HasText HyperdataContact
317 ------------------------------------------------------------------------
318 ------------------------------------------------------------------------
320 documentIdWithNgrams :: HasNodeError err
322 -> Cmd err (Map Ngrams (Map NgramsType Int)))
323 -> [DocumentWithId a]
324 -> Cmd err [DocumentIdWithNgrams a]
325 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
327 toDocumentIdWithNgrams d = do
328 e <- f $ documentData d
329 pure $ DocumentIdWithNgrams d e
332 ------------------------------------------------------------------------
335 instance ExtractNgramsT HyperdataContact
337 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
339 extract :: TermType Lang -> HyperdataContact
340 -> Cmd err (Map Ngrams (Map NgramsType Int))
342 let authors = map text2ngrams
343 $ maybe ["Nothing"] (\a -> [a])
344 $ view (hc_who . _Just . cw_lastName) hc'
346 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
348 instance HasText HyperdataDocument
350 hasText h = catMaybes [ _hyperdataDocument_title h
351 , _hyperdataDocument_abstract h
354 instance ExtractNgramsT HyperdataDocument
356 extractNgramsT :: TermType Lang
358 -> Cmd err (Map Ngrams (Map NgramsType Int))
359 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
361 extractNgramsT' :: TermType Lang
363 -> Cmd err (Map Ngrams (Map NgramsType Int))
364 extractNgramsT' lang' doc = do
365 let source = text2ngrams
366 $ maybe "Nothing" identity
367 $ _hyperdataDocument_source doc
369 institutes = map text2ngrams
370 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
371 $ _hyperdataDocument_institutes doc
373 authors = map text2ngrams
374 $ maybe ["Nothing"] (splitOn ", ")
375 $ _hyperdataDocument_authors doc
377 terms' <- map text2ngrams
378 <$> map (intercalate " " . _terms_label)
380 <$> liftBase (extractTerms lang' $ hasText doc)
382 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
383 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
384 <> [(a', Map.singleton Authors 1) | a' <- authors ]
385 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]