2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 , flowCorpusSearchInDatabase
36 , getOrMk_RootWithCorpus
41 import Prelude (String)
43 import Data.Tuple.Extra (first, second)
44 import Data.Traversable (traverse)
45 import Debug.Trace (trace)
46 import Control.Lens ((^.), view, _Just)
47 import Control.Monad.IO.Class (liftIO)
48 import Data.List (concat)
49 import Data.Map (Map, lookup)
50 import Data.Maybe (Maybe(..), catMaybes)
52 import Data.Text (Text, splitOn, intercalate)
53 import Gargantext.Core (Lang(..))
54 import Gargantext.Core.Types (NodePoly(..), Terms(..))
55 import Gargantext.Core.Types.Individu (Username)
56 import Gargantext.Core.Flow.Types
57 import Gargantext.Core.Types.Main
58 import Gargantext.Database.Config (userMaster, corpusMasterName)
59 import Gargantext.Database.Flow.Utils (insertDocNgrams)
60 import Gargantext.Database.Flow.List
61 import Gargantext.Database.Flow.Types
62 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
63 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
64 import Gargantext.Database.Root (getRoot)
66 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
67 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
68 import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
69 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
70 import Gargantext.Database.Schema.User (getUser, UserLight(..))
71 import Gargantext.Database.TextSearch (searchInDatabase)
72 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
73 import Gargantext.Database.Utils (Cmd)
74 import Gargantext.Ext.IMT (toSchoolName)
75 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
76 import Gargantext.Prelude
77 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
78 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
79 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
80 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
81 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
82 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
83 import Gargantext.Prelude.Utils hiding (sha)
84 import System.FilePath (FilePath)
85 import qualified Data.List as List
86 import qualified Data.Map as Map
87 import qualified Data.Text as Text
88 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
89 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
91 ------------------------------------------------------------------------
93 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
99 -> IO [HyperdataDocument]
100 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
101 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
105 _flowCorpusApi :: ( FlowCmdM env err m)
106 => Username -> Either CorpusName [CorpusId]
111 _flowCorpusApi u n tt l q = do
112 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
113 flowCorpus u n tt docs
115 ------------------------------------------------------------------------
117 flowAnnuaire :: FlowCmdM env err m
119 -> Either CorpusName [CorpusId]
123 flowAnnuaire u n l filePath = do
124 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
125 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
128 _flowCorpusDebat :: FlowCmdM env err m
129 => Username -> Either CorpusName [CorpusId]
132 _flowCorpusDebat u n l fp = do
133 docs <- liftIO ( splitEvery 500
136 :: IO [[GD.GrandDebatReference ]]
138 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
140 flowCorpusFile :: FlowCmdM env err m
141 => Username -> Either CorpusName [CorpusId]
142 -> Limit -- Limit the number of docs (for dev purpose)
143 -> TermType Lang -> FileFormat -> FilePath
145 flowCorpusFile u n l la ff fp = do
146 docs <- liftIO ( splitEvery 500
150 flowCorpus u n la (map (map toHyperdataDocument) docs)
152 -- TODO query with complex query
153 flowCorpusSearchInDatabase :: FlowCmdM env err m
158 flowCorpusSearchInDatabase u la q = do
159 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
162 (Nothing :: Maybe HyperdataCorpus)
163 ids <- map fst <$> searchInDatabase cId (stemIt q)
164 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
168 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
173 _flowCorpusSearchInDatabaseApi u la q = do
174 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
177 (Nothing :: Maybe HyperdataCorpus)
178 ids <- map fst <$> searchInDatabase cId (stemIt q)
179 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
181 ------------------------------------------------------------------------
182 -- | TODO improve the needed type to create/update a corpus
184 data UserInfo = Username Text
186 data CorpusInfo = CorpusName Lang Text
187 | CorpusId Lang NodeId
190 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
193 -> Either CorpusName [CorpusId]
197 flow c u cn la docs = do
198 ids <- traverse (insertMasterDocs c la ) docs
199 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
201 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
203 -> Either CorpusName [CorpusId]
207 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
209 ------------------------------------------------------------------------
210 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
213 -> Either CorpusName [CorpusId]
217 flowCorpusUser l userName corpusName ctype ids = do
219 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus userName corpusName ctype
220 listId <- getOrMkList userCorpusId userId
221 _cooc <- mkNode NodeListCooc listId userId
222 -- TODO: check if present already, ignore
223 _ <- Doc.add userCorpusId ids
225 _tId <- mkNode NodeTexts userCorpusId userId
226 -- printDebug "Node Text Id" tId
229 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left "") ctype
230 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
231 _userListId <- flowList_DbRepo listId ngs
232 _mastListId <- getOrMkList masterCorpusId masterUserId
233 -- _ <- insertOccsUpdates userCorpusId mastListId
234 -- printDebug "userListId" userListId
236 _ <- mkDashboard userCorpusId userId
237 _ <- mkGraph userCorpusId userId
238 --_ <- mkPhylo userCorpusId userId
241 -- _ <- mkAnnuaire rootUserId userId
245 insertMasterDocs :: ( FlowCmdM env err m
253 insertMasterDocs c lang hs = do
254 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left corpusMasterName) c
256 -- TODO Type NodeDocumentUnicised
257 let docs = map addUniqId hs
258 ids <- insertDb masterUserId masterCorpusId docs
261 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
263 -- create a corpus with database name (CSV or PubMed)
264 -- add documents to the corpus (create node_node link)
265 -- this will enable global database monitoring
267 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
268 maps <- mapNodeIdNgrams
269 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
271 terms2id <- insertNgrams $ Map.keys maps
273 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
276 lId <- getOrMkList masterCorpusId masterUserId
277 mapCgramsId <- listInsertDb lId toNodeNgramsW'
278 $ map (first _ngramsTerms . second Map.keys)
281 _return <- insertNodeNodeNgrams2
282 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
283 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
284 <*> Just (fromIntegral w :: Double)
285 | (terms, mapNgramsTypes) <- Map.toList maps
286 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
287 , (nId, w) <- Map.toList mapNodeIdWeight
290 _ <- Doc.add masterCorpusId ids'
291 _cooc <- mkNode NodeListCooc lId masterUserId
293 _ <- insertDocNgrams lId indexedNgrams
298 withLang :: HasText a => TermType Lang
299 -> [DocumentWithId a]
301 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
304 Nothing -> trace ("buildTries here" :: String)
306 $ buildTries n ( fmap toToken $ uniText
307 $ Text.intercalate " . "
316 type CorpusName = Text
319 getOrMkRoot :: (HasNodeError err)
321 -> Cmd err (UserId, RootId)
322 getOrMkRoot username = do
323 maybeUserId <- getUser username
324 userId <- case maybeUserId of
325 Nothing -> nodeError NoUserFound
326 Just user -> pure $ userLight_id user
328 rootId' <- map _node_id <$> getRoot username
330 rootId'' <- case rootId' of
331 [] -> mkRoot username userId
332 n -> case length n >= 2 of
333 True -> nodeError ManyNodeUsers
334 False -> pure rootId'
336 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
337 pure (userId, rootId)
340 getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
342 -> Either CorpusName [CorpusId]
344 -> Cmd err (UserId, RootId, CorpusId)
345 getOrMk_RootWithCorpus username cName c = do
346 (userId, rootId) <- getOrMkRoot username
347 corpusId'' <- if username == userMaster
349 ns <- getCorporaWithParentId rootId
350 pure $ map _node_id ns
352 pure $ fromRight [] cName
354 corpusId' <- if corpusId'' /= []
357 c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
358 _tId <- case head c' of
360 Just c'' -> mkNode NodeTexts c'' userId
363 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
364 pure (userId, rootId, corpusId)
367 ------------------------------------------------------------------------
368 viewUniqId' :: UniqId a
371 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
373 err = panic "[ERROR] Database.Flow.toInsert"
376 toInserted :: [ReturnId]
377 -> Map HashId ReturnId
379 Map.fromList . map (\r -> (reUniqId r, r) )
380 . filter (\r -> reInserted r == True)
382 mergeData :: Map HashId ReturnId
384 -> [DocumentWithId a]
385 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
387 toDocumentWithId (sha,hpd) =
388 DocumentWithId <$> fmap reId (lookup sha rs)
391 ------------------------------------------------------------------------
393 instance HasText HyperdataContact
397 instance ExtractNgramsT HyperdataContact
399 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
401 extract :: TermType Lang -> HyperdataContact
402 -> Cmd err (Map Ngrams (Map NgramsType Int))
404 let authors = map text2ngrams
405 $ maybe ["Nothing"] (\a -> [a])
406 $ view (hc_who . _Just . cw_lastName) hc'
408 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
410 instance HasText HyperdataDocument
412 hasText h = catMaybes [ _hyperdataDocument_title h
413 , _hyperdataDocument_abstract h
416 instance ExtractNgramsT HyperdataDocument
418 extractNgramsT :: TermType Lang
420 -> Cmd err (Map Ngrams (Map NgramsType Int))
421 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
423 extractNgramsT' :: TermType Lang
425 -> Cmd err (Map Ngrams (Map NgramsType Int))
426 extractNgramsT' lang' doc = do
427 let source = text2ngrams
428 $ maybe "Nothing" identity
429 $ _hyperdataDocument_source doc
431 institutes = map text2ngrams
432 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
433 $ _hyperdataDocument_institutes doc
435 authors = map text2ngrams
436 $ maybe ["Nothing"] (splitOn ", ")
437 $ _hyperdataDocument_authors doc
439 terms' <- map text2ngrams
440 <$> map (intercalate " " . _terms_label)
442 <$> liftIO (extractTerms lang' $ hasText doc)
444 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
445 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
446 <> [(a', Map.singleton Authors 1) | a' <- authors ]
447 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
449 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
450 -> Map Ngrams (Map NgramsType Int)
451 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
453 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
455 False -> (Ngrams (Text.take s' t) n , y)
458 documentIdWithNgrams :: HasNodeError err
460 -> Cmd err (Map Ngrams (Map NgramsType Int)))
461 -> [DocumentWithId a]
462 -> Cmd err [DocumentIdWithNgrams a]
463 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
465 toDocumentIdWithNgrams d = do
466 e <- f $ documentData d
467 pure $ DocumentIdWithNgrams d e