2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE FlexibleContexts #-}
24 {-# LANGUAGE InstanceSigs #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27 {-# LANGUAGE RankNTypes #-}
28 {-# LANGUAGE ConstrainedClassMethods #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
32 import Prelude (String)
33 import Debug.Trace (trace)
34 import Control.Lens ((^.), view, Lens', _Just)
35 import Control.Monad (mapM_)
36 import Control.Monad.IO.Class (liftIO)
37 import Data.List (concat)
38 import Data.Map (Map, lookup, toList)
39 import Data.Maybe (Maybe(..), catMaybes)
41 import Data.Text (Text, splitOn, intercalate)
42 import GHC.Show (Show)
43 import Gargantext.API.Ngrams (HasRepoVar)
44 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
45 import Gargantext.Core (Lang(..))
46 import Gargantext.Core.Types (NodePoly(..), Terms(..))
47 import Gargantext.Core.Types.Individu (Username)
48 import Gargantext.Core.Types.Main
49 import Gargantext.Database.Config (userMaster, corpusMasterName)
50 import Gargantext.Database.Flow.Utils (insertDocNgrams)
51 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
52 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
53 import Gargantext.Database.Root (getRoot)
54 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
55 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
56 import Gargantext.Database.Schema.User (getUser, UserLight(..))
57 import Gargantext.Database.TextSearch (searchInDatabase)
58 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
59 import Gargantext.Database.Utils (Cmd, CmdM)
60 import Gargantext.Ext.IMT (toSchoolName)
61 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
62 import Gargantext.Prelude
63 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
64 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65 import Gargantext.Text.Parsers (parseFile, FileFormat)
66 import qualified Gargantext.Text.Parsers.IsidoreApi as Isidore
67 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
68 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
69 import Servant (ServantErr)
70 import System.FilePath (FilePath)
71 import qualified Data.List as List
72 import qualified Data.Map as Map
73 import qualified Data.Text as Text
74 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
75 import qualified Gargantext.Text.Parsers.GrandDebat as GD
77 type FlowCmdM env err m =
84 type FlowCorpus a = ( AddUniqId a
91 ------------------------------------------------------------------------
93 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
99 -> IO [HyperdataDocument]
100 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
101 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
104 flowCorpusApi :: ( FlowCmdM env ServantErr m)
105 => Username -> CorpusName
110 flowCorpusApi u n tt l q = do
111 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
112 flowCorpus u n tt docs
114 ------------------------------------------------------------------------
117 flowAnnuaire :: FlowCmdM env ServantErr m
118 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
119 flowAnnuaire u n l filePath = do
120 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
121 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
124 flowCorpusDebat :: FlowCmdM env ServantErr m
125 => Username -> CorpusName
128 flowCorpusDebat u n l fp = do
129 docs <- liftIO ( splitEvery 500
132 :: IO [[GD.GrandDebatReference ]]
134 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
136 flowCorpusFile :: FlowCmdM env ServantErr m
137 => Username -> CorpusName
138 -> Limit -- Limit the number of docs (for dev purpose)
139 -> TermType Lang -> FileFormat -> FilePath
141 flowCorpusFile u n l la ff fp = do
142 docs <- liftIO ( splitEvery 500
146 flowCorpus u n la (map (map toHyperdataDocument) docs)
148 -- TODO query with complex query
149 flowCorpusSearchInDatabase :: FlowCmdM env err m
150 => Username -> Lang -> Text -> m CorpusId
151 flowCorpusSearchInDatabase u la q = do
152 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
153 ids <- map fst <$> searchInDatabase cId (stemIt q)
154 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
157 flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
158 => Username -> Lang -> Text -> m CorpusId
159 flowCorpusSearchInDatabase' u la q = do
160 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
161 ids <- map fst <$> searchInDatabase cId (stemIt q)
162 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
164 ------------------------------------------------------------------------
166 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
167 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
168 flow c u cn la docs = do
169 ids <- mapM (insertMasterDocs c la ) docs
170 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
172 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
173 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
174 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
177 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
178 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
179 flowCorpusUser l userName corpusName ctype ids = do
181 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
182 -- TODO: check if present already, ignore
183 _ <- Doc.add userCorpusId ids
187 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
188 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
189 userListId <- flowList userId userCorpusId ngs
190 printDebug "userListId" userListId
192 _ <- mkGraph userCorpusId userId
195 -- User Dashboard Flow
196 _ <- mkDashboard userCorpusId userId
199 -- _ <- mkAnnuaire rootUserId userId
203 insertMasterDocs :: ( FlowCmdM env ServantErr m
207 => Maybe c -> TermType Lang -> [a] -> m [DocId]
208 insertMasterDocs c lang hs = do
209 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
211 -- TODO Type NodeDocumentUnicised
212 let hs' = map addUniqId hs
213 ids <- insertDb masterUserId masterCorpusId hs'
214 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
217 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
220 Nothing -> trace ("buildTries here" :: String)
222 $ buildTries n ( fmap toToken $ uniText
223 $ Text.intercalate " . "
225 $ map hasText documentsWithId
231 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
232 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
233 terms2id <- insertNgrams $ Map.keys maps
234 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
236 lId <- getOrMkList masterCorpusId masterUserId
237 _ <- insertDocNgrams lId indexedNgrams
242 type CorpusName = Text
244 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
245 => Username -> CorpusName -> Maybe a
246 -> Cmd err (UserId, RootId, CorpusId)
247 getOrMkRootWithCorpus username cName c = do
248 maybeUserId <- getUser username
249 userId <- case maybeUserId of
250 Nothing -> nodeError NoUserFound
251 Just user -> pure $ userLight_id user
253 rootId' <- map _node_id <$> getRoot username
255 rootId'' <- case rootId' of
256 [] -> mkRoot username userId
257 n -> case length n >= 2 of
258 True -> nodeError ManyNodeUsers
259 False -> pure rootId'
261 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
263 corpusId'' <- if username == userMaster
265 ns <- getCorporaWithParentId rootId
266 pure $ map _node_id ns
270 corpusId' <- if corpusId'' /= []
272 else mk (Just cName) c rootId userId
274 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
276 pure (userId, rootId, corpusId)
279 ------------------------------------------------------------------------
284 uniqId :: Lens' a (Maybe HashId)
287 instance UniqId HyperdataDocument
289 uniqId = hyperdataDocument_uniqId
291 instance UniqId HyperdataContact
295 viewUniqId' :: UniqId a => a -> (HashId, a)
296 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
298 err = panic "[ERROR] Database.Flow.toInsert"
301 toInserted :: [ReturnId] -> Map HashId ReturnId
302 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
303 . filter (\r -> reInserted r == True)
305 data DocumentWithId a = DocumentWithId
306 { documentId :: !NodeId
310 instance HasText a => HasText (DocumentWithId a)
312 hasText (DocumentWithId _ a) = hasText a
314 mergeData :: Map HashId ReturnId
316 -> [DocumentWithId a]
317 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
319 toDocumentWithId (hash,hpd) =
320 DocumentWithId <$> fmap reId (lookup hash rs)
323 ------------------------------------------------------------------------
324 data DocumentIdWithNgrams a = DocumentIdWithNgrams
325 { documentWithId :: !(DocumentWithId a)
326 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
330 class ExtractNgramsT h
332 extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
336 hasText :: h -> [Text]
338 instance HasText HyperdataContact
342 instance ExtractNgramsT HyperdataContact
344 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
346 extract :: TermType Lang -> HyperdataContact
347 -> Cmd err (Map Ngrams (Map NgramsType Int))
349 let authors = map text2ngrams
350 $ maybe ["Nothing"] (\a -> [a])
351 $ view (hc_who . _Just . cw_lastName) hc'
353 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
355 instance HasText HyperdataDocument
357 hasText h = catMaybes [ _hyperdataDocument_title h
358 , _hyperdataDocument_abstract h
361 instance ExtractNgramsT HyperdataDocument
363 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
364 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
366 extractNgramsT' :: TermType Lang -> HyperdataDocument
367 -> Cmd err (Map Ngrams (Map NgramsType Int))
368 extractNgramsT' lang' doc = do
369 let source = text2ngrams
370 $ maybe "Nothing" identity
371 $ _hyperdataDocument_source doc
373 institutes = map text2ngrams
374 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
375 $ _hyperdataDocument_institutes doc
377 authors = map text2ngrams
378 $ maybe ["Nothing"] (splitOn ", ")
379 $ _hyperdataDocument_authors doc
381 terms' <- map text2ngrams
382 <$> map (intercalate " " . _terms_label)
384 <$> liftIO (extractTerms lang' $ hasText doc)
386 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
387 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
388 <> [(a', Map.singleton Authors 1) | a' <- authors ]
389 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
392 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
393 -> Map Ngrams (Map NgramsType Int)
394 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
396 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
398 False -> (Ngrams (Text.take s' t) n , y)
401 documentIdWithNgrams :: HasNodeError err
403 -> Cmd err (Map Ngrams (Map NgramsType Int)))
404 -> [DocumentWithId a]
405 -> Cmd err [DocumentIdWithNgrams a]
406 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
408 toDocumentIdWithNgrams d = do
409 e <- f $ documentData d
410 pure $ DocumentIdWithNgrams d e
414 -- | TODO check optimization
415 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
416 -> Map Ngrams (Map NgramsType (Map NodeId Int))
417 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
419 f :: DocumentIdWithNgrams a
420 -> Map Ngrams (Map NgramsType (Map NodeId Int))
421 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
423 nId = documentId $ documentWithId d
425 ------------------------------------------------------------------------
426 listInsert :: FlowCmdM env err m
427 => ListId -> Map NgramsType [NgramsElement]
429 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
430 -> putListNgrams lId typeList ngElmts
433 flowList :: FlowCmdM env err m => UserId -> CorpusId
434 -> Map NgramsType [NgramsElement]
436 flowList uId cId ngs = do
437 lId <- getOrMkList cId uId
438 printDebug "listId flowList" lId
440 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs