2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
37 , flowCorpusSearchInDatabase
39 , getOrMkRootWithCorpus
42 import Prelude (String)
44 import Debug.Trace (trace)
45 import Control.Lens ((^.), view, _Just)
46 import Control.Monad (mapM_)
47 import Control.Monad.IO.Class (liftIO)
48 import Data.List (concat)
49 import Data.Map (Map, lookup, toList)
50 import Data.Maybe (Maybe(..), catMaybes)
52 import Data.Text (Text, splitOn, intercalate)
53 import GHC.Show (Show)
54 import Gargantext.API.Ngrams (HasRepoVar)
55 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
56 import Gargantext.Core (Lang(..))
57 import Gargantext.Core.Types (NodePoly(..), Terms(..))
58 import Gargantext.Core.Types.Individu (Username)
59 import Gargantext.Core.Flow
60 import Gargantext.Core.Types.Main
61 import Gargantext.Database.Config (userMaster, corpusMasterName)
62 import Gargantext.Database.Flow.Utils (insertDocNgrams)
63 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
64 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
65 import Gargantext.Database.Root (getRoot)
66 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
67 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
68 import Gargantext.Database.Schema.User (getUser, UserLight(..))
69 import Gargantext.Database.TextSearch (searchInDatabase)
70 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
71 import Gargantext.Database.Utils (Cmd, CmdM)
72 import Gargantext.Ext.IMT (toSchoolName)
73 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
74 import Gargantext.Prelude
75 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
76 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
77 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
78 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
79 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
80 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
81 import Gargantext.Prelude.Utils hiding (hash)
82 import System.FilePath (FilePath)
83 import qualified Data.List as List
84 import qualified Data.Map as Map
85 import qualified Data.Text as Text
86 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
87 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
89 type FlowCmdM env err m =
96 ------------------------------------------------------------------------
98 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
104 -> IO [HyperdataDocument]
105 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
106 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
110 _flowCorpusApi :: ( FlowCmdM env err m)
111 => Username -> Either CorpusName [CorpusId]
116 _flowCorpusApi u n tt l q = do
117 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
118 flowCorpus u n tt docs
120 ------------------------------------------------------------------------
123 _flowAnnuaire :: FlowCmdM env err m
124 => Username -> Either CorpusName [CorpusId] -> (TermType Lang) -> FilePath -> m AnnuaireId
125 _flowAnnuaire u n l filePath = do
126 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
127 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
130 _flowCorpusDebat :: FlowCmdM env err m
131 => Username -> Either CorpusName [CorpusId]
134 _flowCorpusDebat u n l fp = do
135 docs <- liftIO ( splitEvery 500
138 :: IO [[GD.GrandDebatReference ]]
140 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
142 flowCorpusFile :: FlowCmdM env err m
143 => Username -> Either CorpusName [CorpusId]
144 -> Limit -- Limit the number of docs (for dev purpose)
145 -> TermType Lang -> FileFormat -> FilePath
147 flowCorpusFile u n l la ff fp = do
148 docs <- liftIO ( splitEvery 500
152 flowCorpus u n la (map (map toHyperdataDocument) docs)
154 -- TODO query with complex query
155 flowCorpusSearchInDatabase :: FlowCmdM env err m
156 => Username -> Lang -> Text -> m CorpusId
157 flowCorpusSearchInDatabase u la q = do
158 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
159 ids <- map fst <$> searchInDatabase cId (stemIt q)
160 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
164 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
165 => Username -> Lang -> Text -> m CorpusId
166 _flowCorpusSearchInDatabaseApi u la q = do
167 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
168 ids <- map fst <$> searchInDatabase cId (stemIt q)
169 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
171 ------------------------------------------------------------------------
172 -- | TODO improve the needed type to create/update a corpus
174 data UserInfo = Username Text
176 data CorpusInfo = CorpusName Lang Text
177 | CorpusId Lang NodeId
181 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
182 => Maybe c -> Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
183 flow c u cn la docs = do
184 ids <- mapM (insertMasterDocs c la ) docs
185 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
187 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
188 => Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
189 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
191 ------------------------------------------------------------------------
192 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
193 => Lang -> Username -> Either CorpusName [CorpusId] -> Maybe c -> [NodeId] -> m CorpusId
194 flowCorpusUser l userName corpusName ctype ids = do
196 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
197 -- TODO: check if present already, ignore
198 _ <- Doc.add userCorpusId ids
199 tId <- mkNode NodeTexts userCorpusId userId
201 printDebug "Node Text Id" tId
205 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
206 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
207 userListId <- flowList userId userCorpusId ngs
208 printDebug "userListId" userListId
210 _ <- mkDashboard userCorpusId userId
211 _ <- mkGraph userCorpusId userId
212 --_ <- mkPhylo userCorpusId userId
217 -- _ <- mkAnnuaire rootUserId userId
221 insertMasterDocs :: ( FlowCmdM env err m
225 => Maybe c -> TermType Lang -> [a] -> m [DocId]
226 insertMasterDocs c lang hs = do
227 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
229 -- TODO Type NodeDocumentUnicised
230 let hs' = map addUniqId hs
231 ids <- insertDb masterUserId masterCorpusId hs'
232 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
235 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
238 Nothing -> trace ("buildTries here" :: String)
240 $ buildTries n ( fmap toToken $ uniText
241 $ Text.intercalate " . "
243 $ map hasText documentsWithId
249 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
250 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
251 terms2id <- insertNgrams $ Map.keys maps
252 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
254 lId <- getOrMkList masterCorpusId masterUserId
255 _ <- insertDocNgrams lId indexedNgrams
259 type CorpusName = Text
262 getOrMkRoot :: (HasNodeError err) => Username -> Cmd err (UserId, RootId)
263 getOrMkRoot username = do
264 maybeUserId <- getUser username
265 userId <- case maybeUserId of
266 Nothing -> nodeError NoUserFound
267 Just user -> pure $ userLight_id user
269 rootId' <- map _node_id <$> getRoot username
271 rootId'' <- case rootId' of
272 [] -> mkRoot username userId
273 n -> case length n >= 2 of
274 True -> nodeError ManyNodeUsers
275 False -> pure rootId'
277 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
278 pure (userId, rootId)
281 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
282 => Username -> Either CorpusName [CorpusId] -> Maybe a
283 -> Cmd err (UserId, RootId, CorpusId)
284 getOrMkRootWithCorpus username cName c = do
285 (userId, rootId) <- getOrMkRoot username
286 corpusId'' <- if username == userMaster
288 ns <- getCorporaWithParentId rootId
289 pure $ map _node_id ns
291 pure $ fromRight [] cName
293 corpusId' <- if corpusId'' /= []
295 else mk (Just $ fromLeft "Default" cName) c rootId userId
297 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
299 pure (userId, rootId, corpusId)
302 ------------------------------------------------------------------------
303 viewUniqId' :: UniqId a => a -> (HashId, a)
304 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
306 err = panic "[ERROR] Database.Flow.toInsert"
309 toInserted :: [ReturnId] -> Map HashId ReturnId
310 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
311 . filter (\r -> reInserted r == True)
313 data DocumentWithId a = DocumentWithId
314 { documentId :: !NodeId
318 instance HasText a => HasText (DocumentWithId a)
320 hasText (DocumentWithId _ a) = hasText a
322 mergeData :: Map HashId ReturnId
324 -> [DocumentWithId a]
325 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
327 toDocumentWithId (hash,hpd) =
328 DocumentWithId <$> fmap reId (lookup hash rs)
331 ------------------------------------------------------------------------
332 data DocumentIdWithNgrams a = DocumentIdWithNgrams
333 { documentWithId :: !(DocumentWithId a)
334 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
338 instance HasText HyperdataContact
342 instance ExtractNgramsT HyperdataContact
344 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
346 extract :: TermType Lang -> HyperdataContact
347 -> Cmd err (Map Ngrams (Map NgramsType Int))
349 let authors = map text2ngrams
350 $ maybe ["Nothing"] (\a -> [a])
351 $ view (hc_who . _Just . cw_lastName) hc'
353 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
355 instance HasText HyperdataDocument
357 hasText h = catMaybes [ _hyperdataDocument_title h
358 , _hyperdataDocument_abstract h
361 instance ExtractNgramsT HyperdataDocument
363 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
364 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
366 extractNgramsT' :: TermType Lang -> HyperdataDocument
367 -> Cmd err (Map Ngrams (Map NgramsType Int))
368 extractNgramsT' lang' doc = do
369 let source = text2ngrams
370 $ maybe "Nothing" identity
371 $ _hyperdataDocument_source doc
373 institutes = map text2ngrams
374 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
375 $ _hyperdataDocument_institutes doc
377 authors = map text2ngrams
378 $ maybe ["Nothing"] (splitOn ", ")
379 $ _hyperdataDocument_authors doc
381 terms' <- map text2ngrams
382 <$> map (intercalate " " . _terms_label)
384 <$> liftIO (extractTerms lang' $ hasText doc)
386 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
387 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
388 <> [(a', Map.singleton Authors 1) | a' <- authors ]
389 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
392 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
393 -> Map Ngrams (Map NgramsType Int)
394 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
396 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
398 False -> (Ngrams (Text.take s' t) n , y)
401 documentIdWithNgrams :: HasNodeError err
403 -> Cmd err (Map Ngrams (Map NgramsType Int)))
404 -> [DocumentWithId a]
405 -> Cmd err [DocumentIdWithNgrams a]
406 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
408 toDocumentIdWithNgrams d = do
409 e <- f $ documentData d
410 pure $ DocumentIdWithNgrams d e
414 -- | TODO check optimization
415 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
416 -> Map Ngrams (Map NgramsType (Map NodeId Int))
417 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
419 f :: DocumentIdWithNgrams a
420 -> Map Ngrams (Map NgramsType (Map NodeId Int))
421 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
423 nId = documentId $ documentWithId d
425 ------------------------------------------------------------------------
426 listInsert :: FlowCmdM env err m
427 => ListId -> Map NgramsType [NgramsElement]
429 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
430 -> putListNgrams lId typeList ngElmts
433 flowList :: FlowCmdM env err m => UserId -> CorpusId
434 -> Map NgramsType [NgramsElement]
436 flowList uId cId ngs = do
437 lId <- getOrMkList cId uId
438 printDebug "listId flowList" lId
440 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs