2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 , flowCorpusSearchInDatabase
36 , getOrMkRootWithCorpus
40 import Prelude (String)
42 import Debug.Trace (trace)
43 import Control.Lens ((^.), view, _Just)
44 import Control.Monad.IO.Class (liftIO)
45 import Data.List (concat)
46 import Data.Map (Map, lookup)
47 import Data.Maybe (Maybe(..), catMaybes)
49 import Data.Text (Text, splitOn, intercalate)
50 import Gargantext.Core (Lang(..))
51 import Gargantext.Core.Types (NodePoly(..), Terms(..))
52 import Gargantext.Core.Types.Individu (Username)
53 import Gargantext.Core.Flow
54 import Gargantext.Core.Types.Main
55 import Gargantext.Database.Config (userMaster, corpusMasterName)
56 import Gargantext.Database.Flow.Utils (insertDocNgrams)
57 import Gargantext.Database.Flow.List
58 import Gargantext.Database.Flow.Types
59 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
60 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
61 import Gargantext.Database.Root (getRoot)
62 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
63 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
64 import Gargantext.Database.Schema.User (getUser, UserLight(..))
65 import Gargantext.Database.TextSearch (searchInDatabase)
66 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
67 import Gargantext.Database.Utils (Cmd)
68 import Gargantext.Ext.IMT (toSchoolName)
69 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
70 import Gargantext.Prelude
71 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
72 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
73 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
74 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
75 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
76 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
77 import Gargantext.Prelude.Utils hiding (sha)
78 import System.FilePath (FilePath)
79 import qualified Data.List as List
80 import qualified Data.Map as Map
81 import qualified Data.Text as Text
82 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
83 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
85 ------------------------------------------------------------------------
87 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
93 -> IO [HyperdataDocument]
94 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
95 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
99 _flowCorpusApi :: ( FlowCmdM env err m)
100 => Username -> Either CorpusName [CorpusId]
105 _flowCorpusApi u n tt l q = do
106 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
107 flowCorpus u n tt docs
109 ------------------------------------------------------------------------
111 flowAnnuaire :: FlowCmdM env err m
113 -> Either CorpusName [CorpusId]
117 flowAnnuaire u n l filePath = do
118 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
119 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
122 _flowCorpusDebat :: FlowCmdM env err m
123 => Username -> Either CorpusName [CorpusId]
126 _flowCorpusDebat u n l fp = do
127 docs <- liftIO ( splitEvery 500
130 :: IO [[GD.GrandDebatReference ]]
132 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
134 flowCorpusFile :: FlowCmdM env err m
135 => Username -> Either CorpusName [CorpusId]
136 -> Limit -- Limit the number of docs (for dev purpose)
137 -> TermType Lang -> FileFormat -> FilePath
139 flowCorpusFile u n l la ff fp = do
140 docs <- liftIO ( splitEvery 500
144 flowCorpus u n la (map (map toHyperdataDocument) docs)
146 -- TODO query with complex query
147 flowCorpusSearchInDatabase :: FlowCmdM env err m
152 flowCorpusSearchInDatabase u la q = do
153 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
156 (Nothing :: Maybe HyperdataCorpus)
157 ids <- map fst <$> searchInDatabase cId (stemIt q)
158 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
162 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
167 _flowCorpusSearchInDatabaseApi u la q = do
168 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
171 (Nothing :: Maybe HyperdataCorpus)
172 ids <- map fst <$> searchInDatabase cId (stemIt q)
173 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
175 ------------------------------------------------------------------------
176 -- | TODO improve the needed type to create/update a corpus
178 data UserInfo = Username Text
180 data CorpusInfo = CorpusName Lang Text
181 | CorpusId Lang NodeId
184 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
187 -> Either CorpusName [CorpusId]
191 flow c u cn la docs = do
192 ids <- mapM (insertMasterDocs c la ) docs
193 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
195 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
197 -> Either CorpusName [CorpusId]
201 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
203 ------------------------------------------------------------------------
204 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
207 -> Either CorpusName [CorpusId]
211 flowCorpusUser l userName corpusName ctype ids = do
213 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
214 listId <- getOrMkList userCorpusId userId
215 _cooc <- mkNode NodeListCooc listId userId
216 -- TODO: check if present already, ignore
217 _ <- Doc.add userCorpusId ids
219 _tId <- mkNode NodeTexts userCorpusId userId
220 -- printDebug "Node Text Id" tId
224 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
225 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
226 _userListId <- flowList listId ngs
227 --mastListId <- getOrMkList masterCorpusId masterUserId
228 -- _ <- insertOccsUpdates userCorpusId mastListId
229 -- printDebug "userListId" userListId
231 _ <- mkDashboard userCorpusId userId
232 _ <- mkGraph userCorpusId userId
233 --_ <- mkPhylo userCorpusId userId
238 -- _ <- mkAnnuaire rootUserId userId
242 insertMasterDocs :: ( FlowCmdM env err m
250 insertMasterDocs c lang hs = do
251 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
253 -- TODO Type NodeDocumentUnicised
254 let hs' = map addUniqId hs
255 ids <- insertDb masterUserId masterCorpusId hs'
256 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
259 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
262 Nothing -> trace ("buildTries here" :: String)
264 $ buildTries n ( fmap toToken $ uniText
265 $ Text.intercalate " . "
267 $ map hasText documentsWithId
273 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
274 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
275 terms2id <- insertNgrams $ Map.keys maps
276 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
278 lId <- getOrMkList masterCorpusId masterUserId
279 _cooc <- mkNode NodeListCooc lId masterUserId
280 _ <- insertDocNgrams lId indexedNgrams
285 type CorpusName = Text
288 getOrMkRoot :: (HasNodeError err)
290 -> Cmd err (UserId, RootId)
291 getOrMkRoot username = do
292 maybeUserId <- getUser username
293 userId <- case maybeUserId of
294 Nothing -> nodeError NoUserFound
295 Just user -> pure $ userLight_id user
297 rootId' <- map _node_id <$> getRoot username
299 rootId'' <- case rootId' of
300 [] -> mkRoot username userId
301 n -> case length n >= 2 of
302 True -> nodeError ManyNodeUsers
303 False -> pure rootId'
305 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
306 pure (userId, rootId)
309 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
311 -> Either CorpusName [CorpusId]
313 -> Cmd err (UserId, RootId, CorpusId)
314 getOrMkRootWithCorpus username cName c = do
315 (userId, rootId) <- getOrMkRoot username
316 corpusId'' <- if username == userMaster
318 ns <- getCorporaWithParentId rootId
319 pure $ map _node_id ns
321 pure $ fromRight [] cName
323 corpusId' <- if corpusId'' /= []
325 else mk (Just $ fromLeft "Default" cName) c rootId userId
327 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
329 pure (userId, rootId, corpusId)
332 ------------------------------------------------------------------------
333 viewUniqId' :: UniqId a
336 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
338 err = panic "[ERROR] Database.Flow.toInsert"
341 toInserted :: [ReturnId]
342 -> Map HashId ReturnId
344 Map.fromList . map (\r -> (reUniqId r, r) )
345 . filter (\r -> reInserted r == True)
347 mergeData :: Map HashId ReturnId
349 -> [DocumentWithId a]
350 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
352 toDocumentWithId (sha,hpd) =
353 DocumentWithId <$> fmap reId (lookup sha rs)
356 ------------------------------------------------------------------------
358 instance HasText HyperdataContact
362 instance ExtractNgramsT HyperdataContact
364 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
366 extract :: TermType Lang -> HyperdataContact
367 -> Cmd err (Map Ngrams (Map NgramsType Int))
369 let authors = map text2ngrams
370 $ maybe ["Nothing"] (\a -> [a])
371 $ view (hc_who . _Just . cw_lastName) hc'
373 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
375 instance HasText HyperdataDocument
377 hasText h = catMaybes [ _hyperdataDocument_title h
378 , _hyperdataDocument_abstract h
381 instance ExtractNgramsT HyperdataDocument
383 extractNgramsT :: TermType Lang
385 -> Cmd err (Map Ngrams (Map NgramsType Int))
386 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
388 extractNgramsT' :: TermType Lang
390 -> Cmd err (Map Ngrams (Map NgramsType Int))
391 extractNgramsT' lang' doc = do
392 let source = text2ngrams
393 $ maybe "Nothing" identity
394 $ _hyperdataDocument_source doc
396 institutes = map text2ngrams
397 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
398 $ _hyperdataDocument_institutes doc
400 authors = map text2ngrams
401 $ maybe ["Nothing"] (splitOn ", ")
402 $ _hyperdataDocument_authors doc
404 terms' <- map text2ngrams
405 <$> map (intercalate " " . _terms_label)
407 <$> liftIO (extractTerms lang' $ hasText doc)
409 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
410 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
411 <> [(a', Map.singleton Authors 1) | a' <- authors ]
412 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
414 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
415 -> Map Ngrams (Map NgramsType Int)
416 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
418 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
420 False -> (Ngrams (Text.take s' t) n , y)
423 documentIdWithNgrams :: HasNodeError err
425 -> Cmd err (Map Ngrams (Map NgramsType Int)))
426 -> [DocumentWithId a]
427 -> Cmd err [DocumentIdWithNgrams a]
428 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
430 toDocumentIdWithNgrams d = do
431 e <- f $ documentData d
432 pure $ DocumentIdWithNgrams d e