2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
30 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
34 , flowCorpusSearchInDatabase
36 , getOrMk_RootWithCorpus
41 import Control.Lens ((^.), view, _Just)
43 import Data.List (concat)
44 import Data.Map (Map, lookup)
45 import Data.Maybe (Maybe(..), catMaybes)
47 import Data.Text (Text, splitOn, intercalate)
48 import Data.Traversable (traverse)
49 import Data.Tuple.Extra (first, second)
50 import Debug.Trace (trace)
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Core.Flow.Types
53 import Gargantext.Core.Types (NodePoly(..), Terms(..))
54 import Gargantext.Core.Types.Individu (User(..))
55 import Gargantext.Core.Types.Main
56 import Gargantext.Database.Action.Flow.List
57 import Gargantext.Database.Action.Flow.Types
58 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, getUserId)
59 import Gargantext.Database.Action.Query.Node
60 import Gargantext.Database.Action.Query.User
61 import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Action.Query.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Action.Query.Tree.Root (getRoot)
64 import Gargantext.Database.Action.Query.Tree (mkRoot)
65 import Gargantext.Database.Action.Search (searchInDatabase)
66 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
67 import Gargantext.Database.Admin.Types.Errors (HasNodeError(..), NodeError(..), nodeError)
68 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
69 import Gargantext.Database.Admin.Utils (Cmd)
70 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
71 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
72 import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
73 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
74 import Gargantext.Ext.IMT (toSchoolName)
75 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
76 import Gargantext.Prelude
77 import Gargantext.Prelude.Utils hiding (sha)
78 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
79 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
80 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
81 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
82 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
83 import Prelude (String)
84 import System.FilePath (FilePath)
85 import qualified Data.List as List
86 import qualified Data.Map as Map
87 import qualified Data.Text as Text
88 import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
89 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
90 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
92 ------------------------------------------------------------------------
94 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
100 -> IO [HyperdataDocument]
101 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
102 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
106 _flowCorpusApi :: ( FlowCmdM env err m)
107 => User -> Either CorpusName [CorpusId]
112 _flowCorpusApi u n tt l q = do
113 docs <- liftBase $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
114 flowCorpus u n tt docs
116 ------------------------------------------------------------------------
118 flowAnnuaire :: FlowCmdM env err m
120 -> Either CorpusName [CorpusId]
124 flowAnnuaire u n l filePath = do
125 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
126 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
129 _flowCorpusDebat :: FlowCmdM env err m
130 => User -> Either CorpusName [CorpusId]
133 _flowCorpusDebat u n l fp = do
134 docs <- liftBase ( splitEvery 500
137 :: IO [[GD.GrandDebatReference ]]
139 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
141 flowCorpusFile :: FlowCmdM env err m
142 => User -> Either CorpusName [CorpusId]
143 -> Limit -- Limit the number of docs (for dev purpose)
144 -> TermType Lang -> FileFormat -> FilePath
146 flowCorpusFile u n l la ff fp = do
147 docs <- liftBase ( splitEvery 500
151 flowCorpus u n la (map (map toHyperdataDocument) docs)
153 -- TODO query with complex query
154 flowCorpusSearchInDatabase :: FlowCmdM env err m
159 flowCorpusSearchInDatabase u la q = do
160 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
161 (UserName userMaster)
163 (Nothing :: Maybe HyperdataCorpus)
164 ids <- map fst <$> searchInDatabase cId (stemIt q)
165 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
169 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
174 _flowCorpusSearchInDatabaseApi u la q = do
175 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
176 (UserName userMaster)
178 (Nothing :: Maybe HyperdataCorpus)
179 ids <- map fst <$> searchInDatabase cId (stemIt q)
180 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
182 ------------------------------------------------------------------------
183 -- | TODO improve the needed type to create/update a corpus
185 data UserInfo = Username Text
187 data CorpusInfo = CorpusName Lang Text
188 | CorpusId Lang NodeId
191 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
194 -> Either CorpusName [CorpusId]
198 flow c u cn la docs = do
199 ids <- traverse (insertMasterDocs c la ) docs
200 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
202 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
204 -> Either CorpusName [CorpusId]
208 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
210 ------------------------------------------------------------------------
211 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
214 -> Either CorpusName [CorpusId]
218 flowCorpusUser l user corpusName ctype ids = do
220 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
221 listId <- getOrMkList userCorpusId userId
222 _cooc <- mkNode NodeListCooc listId userId
223 -- TODO: check if present already, ignore
224 _ <- Doc.add userCorpusId ids
226 _tId <- mkNode NodeTexts userCorpusId userId
227 -- printDebug "Node Text Id" tId
230 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
231 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
232 _userListId <- flowList_DbRepo listId ngs
233 _mastListId <- getOrMkList masterCorpusId masterUserId
234 -- _ <- insertOccsUpdates userCorpusId mastListId
235 -- printDebug "userListId" userListId
237 _ <- mkDashboard userCorpusId userId
238 _ <- mkGraph userCorpusId userId
239 --_ <- mkPhylo userCorpusId userId
242 -- _ <- mkAnnuaire rootUserId userId
246 insertMasterDocs :: ( FlowCmdM env err m
254 insertMasterDocs c lang hs = do
255 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
257 -- TODO Type NodeDocumentUnicised
258 let docs = map addUniqId hs
259 ids <- insertDb masterUserId masterCorpusId docs
262 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
264 -- create a corpus with database name (CSV or PubMed)
265 -- add documents to the corpus (create node_node link)
266 -- this will enable global database monitoring
268 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
269 maps <- mapNodeIdNgrams
270 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
272 terms2id <- insertNgrams $ Map.keys maps
274 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
277 lId <- getOrMkList masterCorpusId masterUserId
278 mapCgramsId <- listInsertDb lId toNodeNgramsW'
279 $ map (first _ngramsTerms . second Map.keys)
282 _return <- insertNodeNodeNgrams2
283 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
284 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
285 <*> Just (fromIntegral w :: Double)
286 | (terms, mapNgramsTypes) <- Map.toList maps
287 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
288 , (nId, w) <- Map.toList mapNodeIdWeight
291 _ <- Doc.add masterCorpusId ids'
292 _cooc <- mkNode NodeListCooc lId masterUserId
294 _ <- insertDocNgrams lId indexedNgrams
299 withLang :: HasText a => TermType Lang
300 -> [DocumentWithId a]
302 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
305 Nothing -> trace ("buildTries here" :: String)
307 $ buildTries n ( fmap toToken $ uniText
308 $ Text.intercalate " . "
317 type CorpusName = Text
319 getOrMkRoot :: (HasNodeError err)
321 -> Cmd err (UserId, RootId)
322 getOrMkRoot user = do
323 userId <- getUserId user
325 rootId' <- map _node_id <$> getRoot user
327 rootId'' <- case rootId' of
329 n -> case length n >= 2 of
330 True -> nodeError ManyNodeUsers
331 False -> pure rootId'
333 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
334 pure (userId, rootId)
337 getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
339 -> Either CorpusName [CorpusId]
341 -> Cmd err (UserId, RootId, CorpusId)
342 getOrMk_RootWithCorpus user cName c = do
343 (userId, rootId) <- getOrMkRoot user
344 corpusId'' <- if user == UserName userMaster
346 ns <- getCorporaWithParentId rootId
347 pure $ map _node_id ns
349 pure $ fromRight [] cName
351 corpusId' <- if corpusId'' /= []
354 c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
355 _tId <- case head c' of
357 Just c'' -> mkNode NodeTexts c'' userId
360 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
361 pure (userId, rootId, corpusId)
364 ------------------------------------------------------------------------
365 viewUniqId' :: UniqId a
368 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
370 err = panic "[ERROR] Database.Flow.toInsert"
373 toInserted :: [ReturnId]
374 -> Map HashId ReturnId
376 Map.fromList . map (\r -> (reUniqId r, r) )
377 . filter (\r -> reInserted r == True)
379 mergeData :: Map HashId ReturnId
381 -> [DocumentWithId a]
382 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
384 toDocumentWithId (sha,hpd) =
385 DocumentWithId <$> fmap reId (lookup sha rs)
388 ------------------------------------------------------------------------
390 instance HasText HyperdataContact
394 instance ExtractNgramsT HyperdataContact
396 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
398 extract :: TermType Lang -> HyperdataContact
399 -> Cmd err (Map Ngrams (Map NgramsType Int))
401 let authors = map text2ngrams
402 $ maybe ["Nothing"] (\a -> [a])
403 $ view (hc_who . _Just . cw_lastName) hc'
405 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
407 instance HasText HyperdataDocument
409 hasText h = catMaybes [ _hyperdataDocument_title h
410 , _hyperdataDocument_abstract h
413 instance ExtractNgramsT HyperdataDocument
415 extractNgramsT :: TermType Lang
417 -> Cmd err (Map Ngrams (Map NgramsType Int))
418 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
420 extractNgramsT' :: TermType Lang
422 -> Cmd err (Map Ngrams (Map NgramsType Int))
423 extractNgramsT' lang' doc = do
424 let source = text2ngrams
425 $ maybe "Nothing" identity
426 $ _hyperdataDocument_source doc
428 institutes = map text2ngrams
429 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
430 $ _hyperdataDocument_institutes doc
432 authors = map text2ngrams
433 $ maybe ["Nothing"] (splitOn ", ")
434 $ _hyperdataDocument_authors doc
436 terms' <- map text2ngrams
437 <$> map (intercalate " " . _terms_label)
439 <$> liftBase (extractTerms lang' $ hasText doc)
441 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
442 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
443 <> [(a', Map.singleton Authors 1) | a' <- authors ]
444 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
446 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
447 -> Map Ngrams (Map NgramsType Int)
448 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
450 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
452 False -> (Ngrams (Text.take s' t) n , y)
455 documentIdWithNgrams :: HasNodeError err
457 -> Cmd err (Map Ngrams (Map NgramsType Int)))
458 -> [DocumentWithId a]
459 -> Cmd err [DocumentIdWithNgrams a]
460 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
462 toDocumentIdWithNgrams d = do
463 e <- f $ documentData d
464 pure $ DocumentIdWithNgrams d e