2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
35 import Prelude (String)
36 import Debug.Trace (trace)
37 import Control.Lens ((^.), view, _Just)
38 import Control.Monad (mapM_)
39 import Control.Monad.IO.Class (liftIO)
40 import Data.List (concat)
41 import Data.Map (Map, lookup, toList)
42 import Data.Maybe (Maybe(..), catMaybes)
44 import Data.Text (Text, splitOn, intercalate)
45 import GHC.Show (Show)
46 import Gargantext.API.Ngrams (HasRepoVar)
47 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
48 import Gargantext.Core (Lang(..))
49 import Gargantext.Core.Types (NodePoly(..), Terms(..))
50 import Gargantext.Core.Types.Individu (Username)
51 import Gargantext.Core.Flow
52 import Gargantext.Core.Types.Main
53 import Gargantext.Database.Config (userMaster, corpusMasterName)
54 import Gargantext.Database.Flow.Utils (insertDocNgrams)
55 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
56 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
57 import Gargantext.Database.Root (getRoot)
58 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
59 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
60 import Gargantext.Database.Schema.User (getUser, UserLight(..))
61 import Gargantext.Database.TextSearch (searchInDatabase)
62 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
63 import Gargantext.Database.Utils (Cmd, CmdM)
64 import Gargantext.Ext.IMT (toSchoolName)
65 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
66 import Gargantext.Prelude
67 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
68 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
69 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
70 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
71 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
72 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
73 import Servant (ServantErr)
74 import System.FilePath (FilePath)
75 import qualified Data.List as List
76 import qualified Data.Map as Map
77 import qualified Data.Text as Text
78 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
79 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
81 type FlowCmdM env err m =
88 ------------------------------------------------------------------------
90 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
96 -> IO [HyperdataDocument]
97 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
98 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
101 flowCorpusApi :: ( FlowCmdM env ServantErr m)
102 => Username -> CorpusName
107 flowCorpusApi u n tt l q = do
108 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
109 flowCorpus u n tt docs
111 ------------------------------------------------------------------------
113 flowAnnuaire :: FlowCmdM env ServantErr m
114 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
115 flowAnnuaire u n l filePath = do
116 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
117 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
120 flowCorpusDebat :: FlowCmdM env ServantErr m
121 => Username -> CorpusName
124 flowCorpusDebat u n l fp = do
125 docs <- liftIO ( splitEvery 500
128 :: IO [[GD.GrandDebatReference ]]
130 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
132 flowCorpusFile :: FlowCmdM env ServantErr m
133 => Username -> CorpusName
134 -> Limit -- Limit the number of docs (for dev purpose)
135 -> TermType Lang -> FileFormat -> FilePath
137 flowCorpusFile u n l la ff fp = do
138 docs <- liftIO ( splitEvery 500
142 flowCorpus u n la (map (map toHyperdataDocument) docs)
144 -- TODO query with complex query
145 flowCorpusSearchInDatabase :: FlowCmdM env err m
146 => Username -> Lang -> Text -> m CorpusId
147 flowCorpusSearchInDatabase u la q = do
148 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
149 ids <- map fst <$> searchInDatabase cId (stemIt q)
150 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
153 flowCorpusSearchInDatabaseApi :: FlowCmdM env ServantErr m
154 => Username -> Lang -> Text -> m CorpusId
155 flowCorpusSearchInDatabaseApi u la q = do
156 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
157 ids <- map fst <$> searchInDatabase cId (stemIt q)
158 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
160 ------------------------------------------------------------------------
161 -- | TODO improve the needed type to create/update a corpus
162 data UserInfo = Username Text
164 data CorpusInfo = CorpusName Lang Text
165 | CorpusId Lang NodeId
168 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
169 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
170 flow c u cn la docs = do
171 ids <- mapM (insertMasterDocs c la ) docs
172 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
174 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
175 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
176 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
178 ------------------------------------------------------------------------
181 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
182 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
183 flowCorpusUser l userName corpusName ctype ids = do
185 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
186 -- TODO: check if present already, ignore
187 _ <- Doc.add userCorpusId ids
191 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
192 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
193 userListId <- flowList userId userCorpusId ngs
194 printDebug "userListId" userListId
196 _ <- mkGraph userCorpusId userId
197 _ <- mkPhylo userCorpusId userId
200 -- User Dashboard Flow
201 _ <- mkDashboard userCorpusId userId
204 -- _ <- mkAnnuaire rootUserId userId
208 insertMasterDocs :: ( FlowCmdM env err m
212 => Maybe c -> TermType Lang -> [a] -> m [DocId]
213 insertMasterDocs c lang hs = do
214 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
216 -- TODO Type NodeDocumentUnicised
217 let hs' = map addUniqId hs
218 ids <- insertDb masterUserId masterCorpusId hs'
219 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
222 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
225 Nothing -> trace ("buildTries here" :: String)
227 $ buildTries n ( fmap toToken $ uniText
228 $ Text.intercalate " . "
230 $ map hasText documentsWithId
236 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
237 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
238 terms2id <- insertNgrams $ Map.keys maps
239 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
241 lId <- getOrMkList masterCorpusId masterUserId
242 _ <- insertDocNgrams lId indexedNgrams
247 type CorpusName = Text
249 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
250 => Username -> CorpusName -> Maybe a
251 -> Cmd err (UserId, RootId, CorpusId)
252 getOrMkRootWithCorpus username cName c = do
253 maybeUserId <- getUser username
254 userId <- case maybeUserId of
255 Nothing -> nodeError NoUserFound
256 Just user -> pure $ userLight_id user
258 rootId' <- map _node_id <$> getRoot username
260 rootId'' <- case rootId' of
261 [] -> mkRoot username userId
262 n -> case length n >= 2 of
263 True -> nodeError ManyNodeUsers
264 False -> pure rootId'
266 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
268 corpusId'' <- if username == userMaster
270 ns <- getCorporaWithParentId rootId
271 pure $ map _node_id ns
275 corpusId' <- if corpusId'' /= []
277 else mk (Just cName) c rootId userId
279 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
281 pure (userId, rootId, corpusId)
284 ------------------------------------------------------------------------
287 viewUniqId' :: UniqId a => a -> (HashId, a)
288 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
290 err = panic "[ERROR] Database.Flow.toInsert"
293 toInserted :: [ReturnId] -> Map HashId ReturnId
294 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
295 . filter (\r -> reInserted r == True)
297 data DocumentWithId a = DocumentWithId
298 { documentId :: !NodeId
302 instance HasText a => HasText (DocumentWithId a)
304 hasText (DocumentWithId _ a) = hasText a
306 mergeData :: Map HashId ReturnId
308 -> [DocumentWithId a]
309 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
311 toDocumentWithId (hash,hpd) =
312 DocumentWithId <$> fmap reId (lookup hash rs)
315 ------------------------------------------------------------------------
316 data DocumentIdWithNgrams a = DocumentIdWithNgrams
317 { documentWithId :: !(DocumentWithId a)
318 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
322 instance HasText HyperdataContact
326 instance ExtractNgramsT HyperdataContact
328 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
330 extract :: TermType Lang -> HyperdataContact
331 -> Cmd err (Map Ngrams (Map NgramsType Int))
333 let authors = map text2ngrams
334 $ maybe ["Nothing"] (\a -> [a])
335 $ view (hc_who . _Just . cw_lastName) hc'
337 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
339 instance HasText HyperdataDocument
341 hasText h = catMaybes [ _hyperdataDocument_title h
342 , _hyperdataDocument_abstract h
345 instance ExtractNgramsT HyperdataDocument
347 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
348 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
350 extractNgramsT' :: TermType Lang -> HyperdataDocument
351 -> Cmd err (Map Ngrams (Map NgramsType Int))
352 extractNgramsT' lang' doc = do
353 let source = text2ngrams
354 $ maybe "Nothing" identity
355 $ _hyperdataDocument_source doc
357 institutes = map text2ngrams
358 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
359 $ _hyperdataDocument_institutes doc
361 authors = map text2ngrams
362 $ maybe ["Nothing"] (splitOn ", ")
363 $ _hyperdataDocument_authors doc
365 terms' <- map text2ngrams
366 <$> map (intercalate " " . _terms_label)
368 <$> liftIO (extractTerms lang' $ hasText doc)
370 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
371 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
372 <> [(a', Map.singleton Authors 1) | a' <- authors ]
373 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
376 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
377 -> Map Ngrams (Map NgramsType Int)
378 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
380 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
382 False -> (Ngrams (Text.take s' t) n , y)
385 documentIdWithNgrams :: HasNodeError err
387 -> Cmd err (Map Ngrams (Map NgramsType Int)))
388 -> [DocumentWithId a]
389 -> Cmd err [DocumentIdWithNgrams a]
390 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
392 toDocumentIdWithNgrams d = do
393 e <- f $ documentData d
394 pure $ DocumentIdWithNgrams d e
398 -- | TODO check optimization
399 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
400 -> Map Ngrams (Map NgramsType (Map NodeId Int))
401 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
403 f :: DocumentIdWithNgrams a
404 -> Map Ngrams (Map NgramsType (Map NodeId Int))
405 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
407 nId = documentId $ documentWithId d
409 ------------------------------------------------------------------------
410 listInsert :: FlowCmdM env err m
411 => ListId -> Map NgramsType [NgramsElement]
413 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
414 -> putListNgrams lId typeList ngElmts
417 flowList :: FlowCmdM env err m => UserId -> CorpusId
418 -> Map NgramsType [NgramsElement]
420 flowList uId cId ngs = do
421 lId <- getOrMkList cId uId
422 printDebug "listId flowList" lId
424 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs