2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE FlexibleContexts #-}
24 {-# LANGUAGE InstanceSigs #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27 {-# LANGUAGE RankNTypes #-}
28 {-# LANGUAGE ConstrainedClassMethods #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
32 import Prelude (String)
33 import Debug.Trace (trace)
34 import Control.Lens ((^.), view, Lens', _Just)
35 import Control.Monad (mapM_)
36 import Control.Monad.IO.Class (liftIO)
37 import Data.List (concat)
38 import Data.Map (Map, lookup, toList)
39 import Data.Maybe (Maybe(..), catMaybes)
41 import Data.Text (Text, splitOn, intercalate)
42 import GHC.Show (Show)
43 import Gargantext.API.Ngrams (HasRepoVar)
44 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
45 import Gargantext.Core (Lang(..))
46 import Gargantext.Core.Types (NodePoly(..), Terms(..))
47 import Gargantext.Core.Types.Individu (Username)
48 import Gargantext.Core.Types.Main
49 import Gargantext.Database.Config (userMaster, corpusMasterName)
50 import Gargantext.Database.Flow.Utils (insertDocNgrams)
51 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
52 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
53 import Gargantext.Database.Root (getRoot)
54 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
55 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
56 import Gargantext.Database.Schema.User (getUser, UserLight(..))
57 import Gargantext.Database.TextSearch (searchInDatabase)
58 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
59 import Gargantext.Database.Utils (Cmd, CmdM)
60 import Gargantext.Ext.IMT (toSchoolName)
61 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
62 import Gargantext.Prelude
63 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
64 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65 import Gargantext.Text.Parsers (parseFile, FileFormat)
66 import qualified Gargantext.Text.Parsers.IsidoreApi as Isidore
67 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
68 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
69 import Servant (ServantErr)
70 import System.FilePath (FilePath)
71 import qualified Data.List as List
72 import qualified Data.Map as Map
73 import qualified Data.Text as Text
74 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
75 import qualified Gargantext.Text.Parsers.GrandDebat as GD
77 type FlowCmdM env err m =
84 type FlowCorpus a = ( AddUniqId a
91 ------------------------------------------------------------------------
93 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
99 -> IO [HyperdataDocument]
100 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
101 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
104 flowCorpusApi :: ( FlowCmdM env ServantErr m)
105 => Username -> CorpusName
110 flowCorpusApi u n tt l q = do
111 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
112 flowCorpus u n tt docs
114 ------------------------------------------------------------------------
117 flowAnnuaire :: FlowCmdM env ServantErr m
118 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
119 flowAnnuaire u n l filePath = do
120 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
121 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
124 flowCorpusDebat :: FlowCmdM env ServantErr m
125 => Username -> CorpusName
128 flowCorpusDebat u n l fp = do
129 docs <- liftIO ( splitEvery 500
132 :: IO [[GD.GrandDebatReference ]]
134 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
136 flowCorpusFile :: FlowCmdM env ServantErr m
137 => Username -> CorpusName
138 -> Limit -- Limit the number of docs (for dev purpose)
139 -> TermType Lang -> FileFormat -> FilePath
141 flowCorpusFile u n l la ff fp = do
142 docs <- liftIO ( splitEvery 500
146 flowCorpus u n la (map (map toHyperdataDocument) docs)
148 -- TODO query with complex query
149 flowCorpusSearchInDatabase :: FlowCmdM env err m
150 => Username -> Lang -> Text -> m CorpusId
151 flowCorpusSearchInDatabase u la q = do
152 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
153 ids <- map fst <$> searchInDatabase cId (stemIt q)
154 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
157 flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
158 => Username -> Lang -> Text -> m CorpusId
159 flowCorpusSearchInDatabase' u la q = do
160 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
161 ids <- map fst <$> searchInDatabase cId (stemIt q)
162 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
164 ------------------------------------------------------------------------
166 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
167 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
168 flow c u cn la docs = do
169 ids <- mapM (insertMasterDocs c la ) docs
170 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
172 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
173 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
174 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
177 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
178 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
179 flowCorpusUser l userName corpusName ctype ids = do
181 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
182 -- TODO: check if present already, ignore
183 _ <- Doc.add userCorpusId ids
187 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
188 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
189 userListId <- flowList userId userCorpusId ngs
190 printDebug "userListId" userListId
192 _ <- mkGraph userCorpusId userId
193 _ <- mkPhylo userCorpusId userId
196 -- User Dashboard Flow
197 _ <- mkDashboard userCorpusId userId
200 -- _ <- mkAnnuaire rootUserId userId
204 insertMasterDocs :: ( FlowCmdM env ServantErr m
208 => Maybe c -> TermType Lang -> [a] -> m [DocId]
209 insertMasterDocs c lang hs = do
210 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
212 -- TODO Type NodeDocumentUnicised
213 let hs' = map addUniqId hs
214 ids <- insertDb masterUserId masterCorpusId hs'
215 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
218 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
221 Nothing -> trace ("buildTries here" :: String)
223 $ buildTries n ( fmap toToken $ uniText
224 $ Text.intercalate " . "
226 $ map hasText documentsWithId
232 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
233 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
234 terms2id <- insertNgrams $ Map.keys maps
235 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
237 lId <- getOrMkList masterCorpusId masterUserId
238 _ <- insertDocNgrams lId indexedNgrams
243 type CorpusName = Text
245 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
246 => Username -> CorpusName -> Maybe a
247 -> Cmd err (UserId, RootId, CorpusId)
248 getOrMkRootWithCorpus username cName c = do
249 maybeUserId <- getUser username
250 userId <- case maybeUserId of
251 Nothing -> nodeError NoUserFound
252 Just user -> pure $ userLight_id user
254 rootId' <- map _node_id <$> getRoot username
256 rootId'' <- case rootId' of
257 [] -> mkRoot username userId
258 n -> case length n >= 2 of
259 True -> nodeError ManyNodeUsers
260 False -> pure rootId'
262 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
264 corpusId'' <- if username == userMaster
266 ns <- getCorporaWithParentId rootId
267 pure $ map _node_id ns
271 corpusId' <- if corpusId'' /= []
273 else mk (Just cName) c rootId userId
275 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
277 pure (userId, rootId, corpusId)
280 ------------------------------------------------------------------------
285 uniqId :: Lens' a (Maybe HashId)
288 instance UniqId HyperdataDocument
290 uniqId = hyperdataDocument_uniqId
292 instance UniqId HyperdataContact
296 viewUniqId' :: UniqId a => a -> (HashId, a)
297 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
299 err = panic "[ERROR] Database.Flow.toInsert"
302 toInserted :: [ReturnId] -> Map HashId ReturnId
303 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
304 . filter (\r -> reInserted r == True)
306 data DocumentWithId a = DocumentWithId
307 { documentId :: !NodeId
311 instance HasText a => HasText (DocumentWithId a)
313 hasText (DocumentWithId _ a) = hasText a
315 mergeData :: Map HashId ReturnId
317 -> [DocumentWithId a]
318 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
320 toDocumentWithId (hash,hpd) =
321 DocumentWithId <$> fmap reId (lookup hash rs)
324 ------------------------------------------------------------------------
325 data DocumentIdWithNgrams a = DocumentIdWithNgrams
326 { documentWithId :: !(DocumentWithId a)
327 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
331 class ExtractNgramsT h
333 extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
337 hasText :: h -> [Text]
339 instance HasText HyperdataContact
343 instance ExtractNgramsT HyperdataContact
345 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
347 extract :: TermType Lang -> HyperdataContact
348 -> Cmd err (Map Ngrams (Map NgramsType Int))
350 let authors = map text2ngrams
351 $ maybe ["Nothing"] (\a -> [a])
352 $ view (hc_who . _Just . cw_lastName) hc'
354 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
356 instance HasText HyperdataDocument
358 hasText h = catMaybes [ _hyperdataDocument_title h
359 , _hyperdataDocument_abstract h
362 instance ExtractNgramsT HyperdataDocument
364 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
365 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
367 extractNgramsT' :: TermType Lang -> HyperdataDocument
368 -> Cmd err (Map Ngrams (Map NgramsType Int))
369 extractNgramsT' lang' doc = do
370 let source = text2ngrams
371 $ maybe "Nothing" identity
372 $ _hyperdataDocument_source doc
374 institutes = map text2ngrams
375 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
376 $ _hyperdataDocument_institutes doc
378 authors = map text2ngrams
379 $ maybe ["Nothing"] (splitOn ", ")
380 $ _hyperdataDocument_authors doc
382 terms' <- map text2ngrams
383 <$> map (intercalate " " . _terms_label)
385 <$> liftIO (extractTerms lang' $ hasText doc)
387 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
388 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
389 <> [(a', Map.singleton Authors 1) | a' <- authors ]
390 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
393 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
394 -> Map Ngrams (Map NgramsType Int)
395 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
397 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
399 False -> (Ngrams (Text.take s' t) n , y)
402 documentIdWithNgrams :: HasNodeError err
404 -> Cmd err (Map Ngrams (Map NgramsType Int)))
405 -> [DocumentWithId a]
406 -> Cmd err [DocumentIdWithNgrams a]
407 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
409 toDocumentIdWithNgrams d = do
410 e <- f $ documentData d
411 pure $ DocumentIdWithNgrams d e
415 -- | TODO check optimization
416 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
417 -> Map Ngrams (Map NgramsType (Map NodeId Int))
418 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
420 f :: DocumentIdWithNgrams a
421 -> Map Ngrams (Map NgramsType (Map NodeId Int))
422 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
424 nId = documentId $ documentWithId d
426 ------------------------------------------------------------------------
427 listInsert :: FlowCmdM env err m
428 => ListId -> Map NgramsType [NgramsElement]
430 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
431 -> putListNgrams lId typeList ngElmts
434 flowList :: FlowCmdM env err m => UserId -> CorpusId
435 -> Map NgramsType [NgramsElement]
437 flowList uId cId ngs = do
438 lId <- getOrMkList cId uId
439 printDebug "listId flowList" lId
441 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs