2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 , flowCorpusSearchInDatabase
36 , getOrMk_RootWithCorpus
40 import Prelude (String)
42 import Data.Traversable (traverse)
43 import Debug.Trace (trace)
44 import Control.Lens ((^.), view, _Just)
45 import Control.Monad.IO.Class (liftIO)
46 import Data.List (concat)
47 import Data.Map (Map, lookup)
48 import Data.Maybe (Maybe(..), catMaybes)
50 import Data.Text (Text, splitOn, intercalate)
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Core.Types (NodePoly(..), Terms(..))
53 import Gargantext.Core.Types.Individu (Username)
54 import Gargantext.Core.Flow.Types
55 import Gargantext.Core.Types.Main
56 import Gargantext.Database.Config (userMaster, corpusMasterName)
57 import Gargantext.Database.Flow.Utils (insertDocNgrams)
58 import Gargantext.Database.Flow.List
59 import Gargantext.Database.Flow.Types
60 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
61 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
62 import Gargantext.Database.Root (getRoot)
64 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
65 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
67 import Gargantext.Database.Schema.User (getUser, UserLight(..))
68 import Gargantext.Database.TextSearch (searchInDatabase)
69 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
70 import Gargantext.Database.Utils (Cmd)
71 import Gargantext.Ext.IMT (toSchoolName)
72 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
73 import Gargantext.Prelude
74 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
75 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
76 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
77 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
78 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
79 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
80 import Gargantext.Prelude.Utils hiding (sha)
81 import System.FilePath (FilePath)
82 import qualified Data.List as List
83 import qualified Data.Map as Map
84 import qualified Data.Text as Text
85 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
86 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
88 ------------------------------------------------------------------------
90 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
96 -> IO [HyperdataDocument]
97 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
98 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
102 _flowCorpusApi :: ( FlowCmdM env err m)
103 => Username -> Either CorpusName [CorpusId]
108 _flowCorpusApi u n tt l q = do
109 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
110 flowCorpus u n tt docs
112 ------------------------------------------------------------------------
114 flowAnnuaire :: FlowCmdM env err m
116 -> Either CorpusName [CorpusId]
120 flowAnnuaire u n l filePath = do
121 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
122 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
125 _flowCorpusDebat :: FlowCmdM env err m
126 => Username -> Either CorpusName [CorpusId]
129 _flowCorpusDebat u n l fp = do
130 docs <- liftIO ( splitEvery 500
133 :: IO [[GD.GrandDebatReference ]]
135 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
137 flowCorpusFile :: FlowCmdM env err m
138 => Username -> Either CorpusName [CorpusId]
139 -> Limit -- Limit the number of docs (for dev purpose)
140 -> TermType Lang -> FileFormat -> FilePath
142 flowCorpusFile u n l la ff fp = do
143 docs <- liftIO ( splitEvery 500
147 flowCorpus u n la (map (map toHyperdataDocument) docs)
149 -- TODO query with complex query
150 flowCorpusSearchInDatabase :: FlowCmdM env err m
155 flowCorpusSearchInDatabase u la q = do
156 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
159 (Nothing :: Maybe HyperdataCorpus)
160 ids <- map fst <$> searchInDatabase cId (stemIt q)
161 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
165 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
170 _flowCorpusSearchInDatabaseApi u la q = do
171 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
174 (Nothing :: Maybe HyperdataCorpus)
175 ids <- map fst <$> searchInDatabase cId (stemIt q)
176 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
178 ------------------------------------------------------------------------
179 -- | TODO improve the needed type to create/update a corpus
181 data UserInfo = Username Text
183 data CorpusInfo = CorpusName Lang Text
184 | CorpusId Lang NodeId
187 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
190 -> Either CorpusName [CorpusId]
194 flow c u cn la docs = do
195 ids <- traverse (insertMasterDocs c la ) docs
196 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
198 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
200 -> Either CorpusName [CorpusId]
204 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
206 ------------------------------------------------------------------------
207 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
210 -> Either CorpusName [CorpusId]
214 flowCorpusUser l userName corpusName ctype ids = do
216 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus userName corpusName ctype
217 listId <- getOrMkList userCorpusId userId
218 _cooc <- mkNode NodeListCooc listId userId
219 -- TODO: check if present already, ignore
220 _ <- Doc.add userCorpusId ids
222 _tId <- mkNode NodeTexts userCorpusId userId
223 -- printDebug "Node Text Id" tId
226 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left "") ctype
227 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
228 _userListId <- flowList_DbRepo listId ngs
229 --mastListId <- getOrMkList masterCorpusId masterUserId
230 -- _ <- insertOccsUpdates userCorpusId mastListId
231 -- printDebug "userListId" userListId
233 _ <- mkDashboard userCorpusId userId
234 _ <- mkGraph userCorpusId userId
235 --_ <- mkPhylo userCorpusId userId
238 -- _ <- mkAnnuaire rootUserId userId
242 insertMasterDocs :: ( FlowCmdM env err m
250 insertMasterDocs c lang hs = do
251 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left corpusMasterName) c
253 -- TODO Type NodeDocumentUnicised
254 let docs = map addUniqId hs
255 ids <- insertDb masterUserId masterCorpusId docs
258 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
259 _ <- Doc.add masterCorpusId ids'
261 -- create a corpus with database name (CSV or PubMed)
262 -- add documents to the corpus (create node_node link)
263 -- this will enable global database monitoring
265 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
266 maps <- mapNodeIdNgrams
267 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
269 lId <- getOrMkList masterCorpusId masterUserId
270 terms2id <- insertNgrams $ Map.keys maps
271 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
273 _cooc <- mkNode NodeListCooc lId masterUserId
274 _ <- insertDocNgrams lId indexedNgrams
279 withLang :: HasText a => TermType Lang
280 -> [DocumentWithId a]
282 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
285 Nothing -> trace ("buildTries here" :: String)
287 $ buildTries n ( fmap toToken $ uniText
288 $ Text.intercalate " . "
297 type CorpusName = Text
300 getOrMkRoot :: (HasNodeError err)
302 -> Cmd err (UserId, RootId)
303 getOrMkRoot username = do
304 maybeUserId <- getUser username
305 userId <- case maybeUserId of
306 Nothing -> nodeError NoUserFound
307 Just user -> pure $ userLight_id user
309 rootId' <- map _node_id <$> getRoot username
311 rootId'' <- case rootId' of
312 [] -> mkRoot username userId
313 n -> case length n >= 2 of
314 True -> nodeError ManyNodeUsers
315 False -> pure rootId'
317 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
318 pure (userId, rootId)
321 getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
323 -> Either CorpusName [CorpusId]
325 -> Cmd err (UserId, RootId, CorpusId)
326 getOrMk_RootWithCorpus username cName c = do
327 (userId, rootId) <- getOrMkRoot username
328 corpusId'' <- if username == userMaster
330 ns <- getCorporaWithParentId rootId
331 pure $ map _node_id ns
333 pure $ fromRight [] cName
335 corpusId' <- if corpusId'' /= []
338 c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
339 _tId <- case head c' of
341 Just c'' -> mkNode NodeTexts c'' userId
344 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
346 pure (userId, rootId, corpusId)
349 ------------------------------------------------------------------------
350 viewUniqId' :: UniqId a
353 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
355 err = panic "[ERROR] Database.Flow.toInsert"
358 toInserted :: [ReturnId]
359 -> Map HashId ReturnId
361 Map.fromList . map (\r -> (reUniqId r, r) )
362 . filter (\r -> reInserted r == True)
364 mergeData :: Map HashId ReturnId
366 -> [DocumentWithId a]
367 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
369 toDocumentWithId (sha,hpd) =
370 DocumentWithId <$> fmap reId (lookup sha rs)
373 ------------------------------------------------------------------------
375 instance HasText HyperdataContact
379 instance ExtractNgramsT HyperdataContact
381 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
383 extract :: TermType Lang -> HyperdataContact
384 -> Cmd err (Map Ngrams (Map NgramsType Int))
386 let authors = map text2ngrams
387 $ maybe ["Nothing"] (\a -> [a])
388 $ view (hc_who . _Just . cw_lastName) hc'
390 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
392 instance HasText HyperdataDocument
394 hasText h = catMaybes [ _hyperdataDocument_title h
395 , _hyperdataDocument_abstract h
398 instance ExtractNgramsT HyperdataDocument
400 extractNgramsT :: TermType Lang
402 -> Cmd err (Map Ngrams (Map NgramsType Int))
403 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
405 extractNgramsT' :: TermType Lang
407 -> Cmd err (Map Ngrams (Map NgramsType Int))
408 extractNgramsT' lang' doc = do
409 let source = text2ngrams
410 $ maybe "Nothing" identity
411 $ _hyperdataDocument_source doc
413 institutes = map text2ngrams
414 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
415 $ _hyperdataDocument_institutes doc
417 authors = map text2ngrams
418 $ maybe ["Nothing"] (splitOn ", ")
419 $ _hyperdataDocument_authors doc
421 terms' <- map text2ngrams
422 <$> map (intercalate " " . _terms_label)
424 <$> liftIO (extractTerms lang' $ hasText doc)
426 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
427 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
428 <> [(a', Map.singleton Authors 1) | a' <- authors ]
429 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
431 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
432 -> Map Ngrams (Map NgramsType Int)
433 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
435 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
437 False -> (Ngrams (Text.take s' t) n , y)
440 documentIdWithNgrams :: HasNodeError err
442 -> Cmd err (Map Ngrams (Map NgramsType Int)))
443 -> [DocumentWithId a]
444 -> Cmd err [DocumentIdWithNgrams a]
445 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
447 toDocumentIdWithNgrams d = do
448 e <- f $ documentData d
449 pure $ DocumentIdWithNgrams d e