2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
35 import Prelude (String)
37 import Debug.Trace (trace)
38 import Control.Lens ((^.), view, _Just)
39 import Control.Monad (mapM_)
40 import Control.Monad.IO.Class (liftIO)
41 import Data.List (concat)
42 import Data.Map (Map, lookup, toList)
43 import Data.Maybe (Maybe(..), catMaybes)
45 import Data.Text (Text, splitOn, intercalate)
46 import GHC.Show (Show)
47 import Gargantext.API.Ngrams (HasRepoVar)
48 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Core.Types (NodePoly(..), Terms(..))
51 import Gargantext.Core.Types.Individu (Username)
52 import Gargantext.Core.Flow
53 import Gargantext.Core.Types.Main
54 import Gargantext.Database.Config (userMaster, corpusMasterName)
55 import Gargantext.Database.Flow.Utils (insertDocNgrams)
56 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
57 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
58 import Gargantext.Database.Root (getRoot)
59 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
60 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
61 import Gargantext.Database.Schema.User (getUser, UserLight(..))
62 import Gargantext.Database.TextSearch (searchInDatabase)
63 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
64 import Gargantext.Database.Utils (Cmd, CmdM)
65 import Gargantext.Ext.IMT (toSchoolName)
66 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
67 import Gargantext.Prelude
68 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
69 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
70 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
71 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
72 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
73 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
74 import Gargantext.Prelude.Utils hiding (hash)
75 import System.FilePath (FilePath)
76 import qualified Data.List as List
77 import qualified Data.Map as Map
78 import qualified Data.Text as Text
79 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
80 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
82 type FlowCmdM env err m =
89 ------------------------------------------------------------------------
91 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
97 -> IO [HyperdataDocument]
98 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
99 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
102 flowCorpusApi :: ( FlowCmdM env err m)
103 => Username -> Either CorpusName [CorpusId]
108 flowCorpusApi u n tt l q = do
109 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
110 flowCorpus u n tt docs
112 ------------------------------------------------------------------------
114 flowAnnuaire :: FlowCmdM env err m
115 => Username -> Either CorpusName [CorpusId] -> (TermType Lang) -> FilePath -> m AnnuaireId
116 flowAnnuaire u n l filePath = do
117 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
118 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
121 flowCorpusDebat :: FlowCmdM env err m
122 => Username -> Either CorpusName [CorpusId]
125 flowCorpusDebat u n l fp = do
126 docs <- liftIO ( splitEvery 500
129 :: IO [[GD.GrandDebatReference ]]
131 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
133 flowCorpusFile :: FlowCmdM env err m
134 => Username -> Either CorpusName [CorpusId]
135 -> Limit -- Limit the number of docs (for dev purpose)
136 -> TermType Lang -> FileFormat -> FilePath
138 flowCorpusFile u n l la ff fp = do
139 docs <- liftIO ( splitEvery 500
143 flowCorpus u n la (map (map toHyperdataDocument) docs)
145 -- TODO query with complex query
146 flowCorpusSearchInDatabase :: FlowCmdM env err m
147 => Username -> Lang -> Text -> m CorpusId
148 flowCorpusSearchInDatabase u la q = do
149 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
150 ids <- map fst <$> searchInDatabase cId (stemIt q)
151 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
154 flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
155 => Username -> Lang -> Text -> m CorpusId
156 flowCorpusSearchInDatabaseApi u la q = do
157 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
158 ids <- map fst <$> searchInDatabase cId (stemIt q)
159 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
161 ------------------------------------------------------------------------
162 -- | TODO improve the needed type to create/update a corpus
163 data UserInfo = Username Text
165 data CorpusInfo = CorpusName Lang Text
166 | CorpusId Lang NodeId
169 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
170 => Maybe c -> Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
171 flow c u cn la docs = do
172 ids <- mapM (insertMasterDocs c la ) docs
173 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
175 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 => Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
177 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
179 ------------------------------------------------------------------------
182 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
183 => Lang -> Username -> Either CorpusName [CorpusId] -> Maybe c -> [NodeId] -> m CorpusId
184 flowCorpusUser l userName corpusName ctype ids = do
186 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
187 -- TODO: check if present already, ignore
188 _ <- Doc.add userCorpusId ids
189 tId <- mkNode NodeTexts userCorpusId userId
191 printDebug "Node Text Id" tId
195 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
196 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
197 userListId <- flowList userId userCorpusId ngs
198 printDebug "userListId" userListId
200 _ <- mkDashboard userCorpusId userId
201 _ <- mkGraph userCorpusId userId
202 _ <- mkPhylo userCorpusId userId
207 -- _ <- mkAnnuaire rootUserId userId
211 insertMasterDocs :: ( FlowCmdM env err m
215 => Maybe c -> TermType Lang -> [a] -> m [DocId]
216 insertMasterDocs c lang hs = do
217 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
219 -- TODO Type NodeDocumentUnicised
220 let hs' = map addUniqId hs
221 ids <- insertDb masterUserId masterCorpusId hs'
222 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
225 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
228 Nothing -> trace ("buildTries here" :: String)
230 $ buildTries n ( fmap toToken $ uniText
231 $ Text.intercalate " . "
233 $ map hasText documentsWithId
239 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
240 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
241 terms2id <- insertNgrams $ Map.keys maps
242 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
244 lId <- getOrMkList masterCorpusId masterUserId
245 _ <- insertDocNgrams lId indexedNgrams
250 type CorpusName = Text
252 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
253 => Username -> Either CorpusName [CorpusId] -> Maybe a
254 -> Cmd err (UserId, RootId, CorpusId)
255 getOrMkRootWithCorpus username cName c = do
256 maybeUserId <- getUser username
257 userId <- case maybeUserId of
258 Nothing -> nodeError NoUserFound
259 Just user -> pure $ userLight_id user
261 rootId' <- map _node_id <$> getRoot username
263 rootId'' <- case rootId' of
264 [] -> mkRoot username userId
265 n -> case length n >= 2 of
266 True -> nodeError ManyNodeUsers
267 False -> pure rootId'
269 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
271 corpusId'' <- if username == userMaster
273 ns <- getCorporaWithParentId rootId
274 pure $ map _node_id ns
276 pure $ fromRight [] cName
278 corpusId' <- if corpusId'' /= []
280 else mk (Just $ fromLeft "Default" cName) c rootId userId
282 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
284 pure (userId, rootId, corpusId)
287 ------------------------------------------------------------------------
290 viewUniqId' :: UniqId a => a -> (HashId, a)
291 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
293 err = panic "[ERROR] Database.Flow.toInsert"
296 toInserted :: [ReturnId] -> Map HashId ReturnId
297 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
298 . filter (\r -> reInserted r == True)
300 data DocumentWithId a = DocumentWithId
301 { documentId :: !NodeId
305 instance HasText a => HasText (DocumentWithId a)
307 hasText (DocumentWithId _ a) = hasText a
309 mergeData :: Map HashId ReturnId
311 -> [DocumentWithId a]
312 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
314 toDocumentWithId (hash,hpd) =
315 DocumentWithId <$> fmap reId (lookup hash rs)
318 ------------------------------------------------------------------------
319 data DocumentIdWithNgrams a = DocumentIdWithNgrams
320 { documentWithId :: !(DocumentWithId a)
321 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
325 instance HasText HyperdataContact
329 instance ExtractNgramsT HyperdataContact
331 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
333 extract :: TermType Lang -> HyperdataContact
334 -> Cmd err (Map Ngrams (Map NgramsType Int))
336 let authors = map text2ngrams
337 $ maybe ["Nothing"] (\a -> [a])
338 $ view (hc_who . _Just . cw_lastName) hc'
340 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
342 instance HasText HyperdataDocument
344 hasText h = catMaybes [ _hyperdataDocument_title h
345 , _hyperdataDocument_abstract h
348 instance ExtractNgramsT HyperdataDocument
350 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
351 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
353 extractNgramsT' :: TermType Lang -> HyperdataDocument
354 -> Cmd err (Map Ngrams (Map NgramsType Int))
355 extractNgramsT' lang' doc = do
356 let source = text2ngrams
357 $ maybe "Nothing" identity
358 $ _hyperdataDocument_source doc
360 institutes = map text2ngrams
361 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
362 $ _hyperdataDocument_institutes doc
364 authors = map text2ngrams
365 $ maybe ["Nothing"] (splitOn ", ")
366 $ _hyperdataDocument_authors doc
368 terms' <- map text2ngrams
369 <$> map (intercalate " " . _terms_label)
371 <$> liftIO (extractTerms lang' $ hasText doc)
373 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
374 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
375 <> [(a', Map.singleton Authors 1) | a' <- authors ]
376 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
379 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
380 -> Map Ngrams (Map NgramsType Int)
381 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
383 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
385 False -> (Ngrams (Text.take s' t) n , y)
388 documentIdWithNgrams :: HasNodeError err
390 -> Cmd err (Map Ngrams (Map NgramsType Int)))
391 -> [DocumentWithId a]
392 -> Cmd err [DocumentIdWithNgrams a]
393 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
395 toDocumentIdWithNgrams d = do
396 e <- f $ documentData d
397 pure $ DocumentIdWithNgrams d e
401 -- | TODO check optimization
402 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
403 -> Map Ngrams (Map NgramsType (Map NodeId Int))
404 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
406 f :: DocumentIdWithNgrams a
407 -> Map Ngrams (Map NgramsType (Map NodeId Int))
408 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
410 nId = documentId $ documentWithId d
412 ------------------------------------------------------------------------
413 listInsert :: FlowCmdM env err m
414 => ListId -> Map NgramsType [NgramsElement]
416 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
417 -> putListNgrams lId typeList ngElmts
420 flowList :: FlowCmdM env err m => UserId -> CorpusId
421 -> Map NgramsType [NgramsElement]
423 flowList uId cId ngs = do
424 lId <- getOrMkList cId uId
425 printDebug "listId flowList" lId
427 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs