2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 , flowCorpusSearchInDatabase
36 , getOrMk_RootWithCorpus
41 import Prelude (String)
43 import Data.Tuple.Extra (first, second)
44 import Data.Traversable (traverse)
45 import Debug.Trace (trace)
46 import Control.Lens ((^.), view, _Just)
47 import Data.List (concat)
48 import Data.Map (Map, lookup)
49 import Data.Maybe (Maybe(..), catMaybes)
51 import Data.Text (Text, splitOn, intercalate)
52 import Gargantext.Core (Lang(..))
53 import Gargantext.Core.Types (NodePoly(..), Terms(..))
54 import Gargantext.Core.Types.Individu (Username)
55 import Gargantext.Core.Flow.Types
56 import Gargantext.Core.Types.Main
57 import Gargantext.Database.Config (userMaster, corpusMasterName)
58 import Gargantext.Database.Flow.Utils (insertDocNgrams)
59 import Gargantext.Database.Flow.List
60 import Gargantext.Database.Flow.Types
61 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Root (getRoot)
65 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
66 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
67 import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
68 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
69 import Gargantext.Database.Schema.User (getUser, UserLight(..))
70 import Gargantext.Database.TextSearch (searchInDatabase)
71 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
72 import Gargantext.Database.Utils (Cmd)
73 import Gargantext.Ext.IMT (toSchoolName)
74 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
75 import Gargantext.Prelude
76 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
77 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
78 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
79 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
80 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
81 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
82 import Gargantext.Prelude.Utils hiding (sha)
83 import System.FilePath (FilePath)
84 import qualified Data.List as List
85 import qualified Data.Map as Map
86 import qualified Data.Text as Text
87 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
88 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
90 ------------------------------------------------------------------------
-- | Query shape for the Isidore API: either a free-text query
-- ('ApiIsidoreQuery') or an author lookup ('ApiIsidoreAuth').
-- 'getDataApi' below dispatches on the constructor.
92 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
-- | Fetch documents from the Isidore API for the given language/limit.
-- NOTE(review): the leading lines of this type signature are missing from
-- this extract; presumably :: Lang -> Maybe Limit -> ApiQuery
-- -> IO [HyperdataDocument] — confirm against the full source.
98 -> IO [HyperdataDocument]
-- A text query passes (Just q) as the query argument; an auth query passes
-- (Just q) as the author argument instead.
99 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
100 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
-- | Build a corpus from Isidore API results: fetch documents in batches of
-- 500 and hand them to 'flowCorpus'.
-- NOTE(review): several signature lines are missing from this extract
-- (remaining parameters after the Either are not visible).
104 _flowCorpusApi :: ( FlowCmdM env err m)
105 => Username -> Either CorpusName [CorpusId]
110 _flowCorpusApi u n tt l q = do
-- splitEvery 500 keeps each DB insertion batch bounded.
111 docs <- liftBase $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
112 flowCorpus u n tt docs
114 ------------------------------------------------------------------------
-- | Build an annuaire (directory of contacts) from an IMT users file:
-- deserialise contacts from @filePath@, batch them by 500, and run the
-- generic 'flow' with an annuaire corpus type.
-- NOTE(review): intermediate signature lines are missing from this extract.
116 flowAnnuaire :: FlowCmdM env err m
118 -> Either CorpusName [CorpusId]
122 flowAnnuaire u n l filePath = do
-- The explicit annotation fixes the element type to HyperdataContact.
123 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
124 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
-- | Build a corpus from a "Grand Débat" dump: read references from the file,
-- batch by 500, convert to HyperdataDocument, and flow them in with a
-- multi-language French term type.
-- NOTE(review): the lines between the batching and the type annotation
-- (presumably the file read / JSON decode) are missing from this extract.
127 _flowCorpusDebat :: FlowCmdM env err m
128 => Username -> Either CorpusName [CorpusId]
131 _flowCorpusDebat u n l fp = do
132 docs <- liftBase ( splitEvery 500
135 :: IO [[GD.GrandDebatReference ]]
137 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
-- | Build a corpus from a local file: parse it with the given 'FileFormat',
-- batch the parsed documents by 500, and flow them into the database.
-- NOTE(review): the lines between 'splitEvery' and the final call (presumably
-- @take l <$> parseFile ff fp@) are missing from this extract, so the use of
-- the limit @l@ cannot be confirmed here.
139 flowCorpusFile :: FlowCmdM env err m
140 => Username -> Either CorpusName [CorpusId]
141 -> Limit -- Limit the number of docs (for dev purpose)
142 -> TermType Lang -> FileFormat -> FilePath
144 flowCorpusFile u n l la ff fp = do
145 docs <- liftBase ( splitEvery 500
149 flowCorpus u n la (map (map toHyperdataDocument) docs)
151 -- TODO query with complex query
152 flowCorpusSearchInDatabase :: FlowCmdM env err m
157 flowCorpusSearchInDatabase u la q = do
158 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
161 (Nothing :: Maybe HyperdataCorpus)
162 ids <- map fst <$> searchInDatabase cId (stemIt q)
163 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
-- | Same flow as 'flowCorpusSearchInDatabase' (search master corpus, build a
-- user corpus from the hits); kept as a separate unused entry point.
-- NOTE(review): this duplicates 'flowCorpusSearchInDatabase' line-for-line in
-- the visible portion — consider delegating to it. Signature tail and
-- getOrMk_RootWithCorpus arguments are missing from this extract.
167 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
172 _flowCorpusSearchInDatabaseApi u la q = do
173 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
176 (Nothing :: Maybe HyperdataCorpus)
177 ids <- map fst <$> searchInDatabase cId (stemIt q)
178 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
180 ------------------------------------------------------------------------
181 -- | TODO improve the needed type to create/update a corpus
-- | Ways to identify a user.
-- NOTE(review): the following original line (184) is missing from this
-- extract — presumably a second constructor (e.g. a user id variant);
-- confirm against the full source.
183 data UserInfo = Username Text
-- | Ways to identify a corpus: by name (to be created/looked up) or by an
-- existing node id; both carry the corpus language.
-- NOTE(review): a trailing line (187, likely a deriving clause) is missing
-- from this extract.
185 data CorpusInfo = CorpusName Lang Text
186 | CorpusId Lang NodeId
-- | Generic corpus ingestion: insert every batch of documents into the
-- master corpus, then attach all resulting ids to the user's corpus via
-- 'flowCorpusUser'.
-- NOTE(review): intermediate signature lines are missing from this extract.
189 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
192 -> Either CorpusName [CorpusId]
196 flow c u cn la docs = do
-- One insertMasterDocs call per batch; ids of all batches are concatenated.
197 ids <- traverse (insertMasterDocs c la ) docs
198 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
-- | 'flow' specialised to a plain document corpus (no extra corpus hyperdata).
-- NOTE(review): intermediate signature lines are missing from this extract.
200 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
202 -> Either CorpusName [CorpusId]
206 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
208 ------------------------------------------------------------------------
-- | Wire freshly-inserted documents into the user's corpus:
--   * get/create the user's root, corpus, and list nodes;
--   * link the document ids to the user corpus;
--   * build ngrams lists from user + master corpora and store them;
--   * create the auxiliary Texts/Dashboard/Graph nodes.
-- NOTE(review): signature lines and the final return (presumably
-- @pure userCorpusId@) are missing from this extract.
209 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
212 -> Either CorpusName [CorpusId]
216 flowCorpusUser l userName corpusName ctype ids = do
-- User-side nodes.
218 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus userName corpusName ctype
219 listId <- getOrMkList userCorpusId userId
220 _cooc <- mkNode NodeListCooc listId userId
221 -- TODO: check if present already, ignore
222 _ <- Doc.add userCorpusId ids
224 _tId <- mkNode NodeTexts userCorpusId userId
225 -- printDebug "Node Text Id" tId
-- Master-side nodes and ngrams lists; 2/3/StopSize 3 are list-building
-- parameters (group sizes / stop-word threshold — confirm semantics in
-- Gargantext.Text.List.buildNgramsLists).
228 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left "") ctype
229 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
230 _userListId <- flowList_DbRepo listId ngs
231 _mastListId <- getOrMkList masterCorpusId masterUserId
232 -- _ <- insertOccsUpdates userCorpusId mastListId
233 -- printDebug "userListId" userListId
-- Auxiliary visualisation nodes.
235 _ <- mkDashboard userCorpusId userId
236 _ <- mkGraph userCorpusId userId
237 --_ <- mkPhylo userCorpusId userId
240 -- _ <- mkAnnuaire rootUserId userId
-- | Insert a batch of documents into the master corpus and index their
-- ngrams:
--   1. add unique hash ids and insert the documents;
--   2. extract ngrams per document (language chosen by 'withLang');
--   3. insert the ngrams, index them, and record node<->ngrams links.
-- NOTE(review): many lines of this function are missing from this extract
-- (constraint list, ids' binding, parts of the where/let chain), so the
-- comments below only describe the visible steps.
244 insertMasterDocs :: ( FlowCmdM env err m
252 insertMasterDocs c lang hs = do
253 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left corpusMasterName) c
255 -- TODO Type NodeDocumentUnicised
-- Attach deterministic unique ids (hashes) before insertion.
256 let docs = map addUniqId hs
257 ids <- insertDb masterUserId masterCorpusId docs
-- Join DB return ids back onto the original documents by hash.
260 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
262 -- create a corpus with database name (CSV or PubMed)
263 -- add documents to the corpus (create node_node link)
264 -- this will enable global database monitoring
266 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
267 maps <- mapNodeIdNgrams
268 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
-- Persist the ngrams themselves, then re-key the map by their DB ids.
270 terms2id <- insertNgrams $ Map.keys maps
272 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
-- Master list + node_ngrams rows.
275 lId <- getOrMkList masterCorpusId masterUserId
276 mapCgramsId <- listInsertDb lId toNodeNgramsW'
277 $ map (first _ngramsTerms . second Map.keys)
-- One NodeNodeNgrams2 row per (document, ngrams-type, term) occurrence;
-- getCgramsId may return Nothing, so catMaybes drops unmatched terms.
280 _return <- insertNodeNodeNgrams2
281 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
282 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
283 <*> Just (fromIntegral w :: Double)
284 | (terms, mapNgramsTypes) <- Map.toList maps
285 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
286 , (nId, w) <- Map.toList mapNodeIdWeight
-- NOTE(review): ids' is bound on a missing line — presumably the inserted
-- document node ids; confirm against the full source.
289 _ <- Doc.add masterCorpusId ids'
290 _cooc <- mkNode NodeListCooc lId masterUserId
292 _ <- insertDocNgrams lId indexedNgrams
-- | For the 'Unsupervised' term type, build the token tries from the corpus
-- text itself when none were supplied; other term types are presumably
-- passed through on a line missing from this extract.
-- NOTE(review): the where/case structure around lines 299-314 is incomplete
-- here (only the Nothing branch of the model lookup is visible).
297 withLang :: HasText a => TermType Lang
298 -> [DocumentWithId a]
300 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
-- No pre-built model: train tries on the concatenated document text.
303 Nothing -> trace ("buildTries here" :: String)
305 $ buildTries n ( fmap toToken $ uniText
306 $ Text.intercalate " . "
-- | Human-readable corpus name (alias only; adds no type safety).
315 type CorpusName = Text
-- | Resolve a username to its user id and root node id, creating the root
-- node if absent. Fails with NoUserFound / ManyNodeUsers / NoRootFound.
-- NOTE(review): the Username parameter line of the signature is missing from
-- this extract.
318 getOrMkRoot :: (HasNodeError err)
320 -> Cmd err (UserId, RootId)
321 getOrMkRoot username = do
322 maybeUserId <- getUser username
323 userId <- case maybeUserId of
324 Nothing -> nodeError NoUserFound
325 Just user -> pure $ userLight_id user
327 rootId' <- map _node_id <$> getRoot username
-- No root yet: create one. More than one root for a user is an error.
329 rootId'' <- case rootId' of
330 [] -> mkRoot username userId
331 n -> case length n >= 2 of
332 True -> nodeError ManyNodeUsers
333 False -> pure rootId'
-- head here is guarded by the NoRootFound fallback (head presumably returns
-- Maybe via Gargantext.Prelude's safe head — confirm).
335 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
336 pure (userId, rootId)
-- | Resolve (or create) a user's root node together with a corpus under it.
-- For the master user, existing corpora under the root are reused; for other
-- users the corpus ids come from the Right side of @cName@. A new corpus
-- (plus a Texts child node) is created when none is found.
-- NOTE(review): several lines are missing from this extract (signature
-- parameters, the then/else keywords, the Nothing branch of @head c'@), so
-- the control flow annotated below is partially inferred — confirm against
-- the full source.
339 getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
341 -> Either CorpusName [CorpusId]
343 -> Cmd err (UserId, RootId, CorpusId)
344 getOrMk_RootWithCorpus username cName c = do
345 (userId, rootId) <- getOrMkRoot username
-- Candidate corpus ids: master user scans children of root, others take
-- the explicit id list (empty when a name was given).
346 corpusId'' <- if username == userMaster
348 ns <- getCorporaWithParentId rootId
349 pure $ map _node_id ns
351 pure $ fromRight [] cName
-- No candidate: create the corpus (default name "Default") and a Texts node.
353 corpusId' <- if corpusId'' /= []
356 c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
357 _tId <- case head c' of
359 Just c'' -> mkNode NodeTexts c'' userId
362 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
363 pure (userId, rootId, corpusId)
366 ------------------------------------------------------------------------
-- | Pair a document with its unique hash id; panics if the hash is missing
-- (documents are expected to have passed through addUniqId first).
-- NOTE(review): the signature tail and the @where@ keyword are missing from
-- this extract.
367 viewUniqId' :: UniqId a
370 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
372 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the successfully inserted rows by their unique hash id, dropping
-- rows the DB reported as not inserted.
-- NOTE(review): the defining line (377) is missing from this extract.
-- Style nit for the full source: @reInserted r == True@ is just
-- @reInserted r@.
375 toInserted :: [ReturnId]
376 -> Map HashId ReturnId
378 Map.fromList . map (\r -> (reUniqId r, r) )
379 . filter (\r -> reInserted r == True)
-- | Join inserted return-ids with the original documents by hash: keep only
-- documents whose hash appears in the inserted map, pairing each with its
-- new node id.
-- NOTE(review): a signature line and the @where@ line are missing from this
-- extract. Style nit for the full source: @catMaybes . map f@ is @mapMaybe f@.
381 mergeData :: Map HashId ReturnId
383 -> [DocumentWithId a]
384 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
386 toDocumentWithId (sha,hpd) =
387 DocumentWithId <$> fmap reId (lookup sha rs)
390 ------------------------------------------------------------------------
-- | HasText for contacts.
-- NOTE(review): any method definitions (original lines 393-395) are missing
-- from this extract — this may rely on class defaults; confirm.
392 instance HasText HyperdataContact
-- | Ngrams extraction for contacts: only the last name is indexed, as a
-- single Authors ngram (capped at 255 chars by filterNgramsT).
-- NOTE(review): the @where@ line and the start of the extract equation are
-- missing from this extract; hc' is bound on a missing line.
396 instance ExtractNgramsT HyperdataContact
398 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
400 extract :: TermType Lang -> HyperdataContact
401 -> Cmd err (Map Ngrams (Map NgramsType Int))
-- Missing last name falls back to the literal "Nothing".
403 let authors = map text2ngrams
404 $ maybe ["Nothing"] (\a -> [a])
405 $ view (hc_who . _Just . cw_lastName) hc'
407 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
-- | The searchable text of a document is its title and abstract (whichever
-- are present).
-- NOTE(review): the closing of this list (original lines 413+) is missing
-- from this extract.
409 instance HasText HyperdataDocument
411 hasText h = catMaybes [ _hyperdataDocument_title h
412 , _hyperdataDocument_abstract h
-- | Ngrams extraction for documents: source, institutes, authors, and terms
-- extracted from the text, each tagged with its NgramsType and weight 1.
-- Terms longer than 255 chars are truncated by filterNgramsT.
-- NOTE(review): a few lines are missing from this extract (instance @where@,
-- parameter lines of the signatures).
415 instance ExtractNgramsT HyperdataDocument
417 extractNgramsT :: TermType Lang
419 -> Cmd err (Map Ngrams (Map NgramsType Int))
420 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
422 extractNgramsT' :: TermType Lang
424 -> Cmd err (Map Ngrams (Map NgramsType Int))
425 extractNgramsT' lang' doc = do
-- Missing metadata fields fall back to the literal "Nothing".
426 let source = text2ngrams
427 $ maybe "Nothing" identity
428 $ _hyperdataDocument_source doc
-- Institutes are comma-separated and normalised to school names.
430 institutes = map text2ngrams
431 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
432 $ _hyperdataDocument_institutes doc
434 authors = map text2ngrams
435 $ maybe ["Nothing"] (splitOn ", ")
436 $ _hyperdataDocument_authors doc
-- Multi-word terms are re-joined with spaces after extraction.
438 terms' <- map text2ngrams
439 <$> map (intercalate " " . _terms_label)
441 <$> liftBase (extractTerms lang' $ hasText doc)
443 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
444 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
445 <> [(a', Map.singleton Authors 1) | a' <- authors ]
446 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | Truncate every ngram's text to at most @s@ characters, keeping its
-- type/weight map unchanged.
-- NOTE(review): the @where@ line and the True branch of the case (terms
-- already short enough, presumably returned unchanged) are missing from this
-- extract.
448 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
449 -> Map Ngrams (Map NgramsType Int)
450 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
452 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
454 False -> (Ngrams (Text.take s' t) n , y)
-- | Run the ngrams-extraction action @f@ over each document's data and pair
-- the document with its extracted ngrams map.
-- NOTE(review): a signature line and the @where@ line are missing from this
-- extract.
457 documentIdWithNgrams :: HasNodeError err
459 -> Cmd err (Map Ngrams (Map NgramsType Int)))
460 -> [DocumentWithId a]
461 -> Cmd err [DocumentIdWithNgrams a]
462 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
464 toDocumentIdWithNgrams d = do
465 e <- f $ documentData d
466 pure $ DocumentIdWithNgrams d e