2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
35 import Prelude (String)
37 import Debug.Trace (trace)
38 import Control.Lens ((^.), view, _Just)
39 import Control.Monad (mapM_)
40 import Control.Monad.IO.Class (liftIO)
41 import Data.List (concat)
42 import Data.Map (Map, lookup, toList)
43 import Data.Maybe (Maybe(..), catMaybes)
45 import Data.Text (Text, splitOn, intercalate)
46 import GHC.Show (Show)
47 import Gargantext.API.Ngrams (HasRepoVar)
48 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Core.Types (NodePoly(..), Terms(..))
51 import Gargantext.Core.Types.Individu (Username)
52 import Gargantext.Core.Flow
53 import Gargantext.Core.Types.Main
54 import Gargantext.Database.Config (userMaster, corpusMasterName)
55 import Gargantext.Database.Flow.Utils (insertDocNgrams)
56 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
57 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
58 import Gargantext.Database.Root (getRoot)
59 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
60 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
61 import Gargantext.Database.Schema.User (getUser, UserLight(..))
62 import Gargantext.Database.TextSearch (searchInDatabase)
63 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
64 import Gargantext.Database.Utils (Cmd, CmdM)
65 import Gargantext.Ext.IMT (toSchoolName)
66 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
67 import Gargantext.Prelude
68 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
69 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
70 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
71 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
72 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
73 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
74 import Gargantext.Prelude.Utils hiding (hash)
75 import System.FilePath (FilePath)
76 import qualified Data.List as List
77 import qualified Data.Map as Map
78 import qualified Data.Text as Text
79 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
80 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
82 type FlowCmdM env err m =
89 ------------------------------------------------------------------------
91 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
97 -> IO [HyperdataDocument]
98 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
99 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
102 flowCorpusApi :: ( FlowCmdM env err m)
103 => Username -> Either CorpusName [CorpusId]
108 flowCorpusApi u n tt l q = do
109 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
110 flowCorpus u n tt docs
112 ------------------------------------------------------------------------
114 flowAnnuaire :: FlowCmdM env err m
115 => Username -> Either CorpusName [CorpusId] -> (TermType Lang) -> FilePath -> m AnnuaireId
116 flowAnnuaire u n l filePath = do
117 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
118 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
121 flowCorpusDebat :: FlowCmdM env err m
122 => Username -> Either CorpusName [CorpusId]
125 flowCorpusDebat u n l fp = do
126 docs <- liftIO ( splitEvery 500
129 :: IO [[GD.GrandDebatReference ]]
131 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
133 flowCorpusFile :: FlowCmdM env err m
134 => Username -> Either CorpusName [CorpusId]
135 -> Limit -- Limit the number of docs (for dev purpose)
136 -> TermType Lang -> FileFormat -> FilePath
138 flowCorpusFile u n l la ff fp = do
139 docs <- liftIO ( splitEvery 500
143 flowCorpus u n la (map (map toHyperdataDocument) docs)
145 -- TODO query with complex query
146 flowCorpusSearchInDatabase :: FlowCmdM env err m
147 => Username -> Lang -> Text -> m CorpusId
148 flowCorpusSearchInDatabase u la q = do
149 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
150 ids <- map fst <$> searchInDatabase cId (stemIt q)
151 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
154 flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
155 => Username -> Lang -> Text -> m CorpusId
156 flowCorpusSearchInDatabaseApi u la q = do
157 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
158 ids <- map fst <$> searchInDatabase cId (stemIt q)
159 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
161 ------------------------------------------------------------------------
162 -- | TODO improve the needed type to create/update a corpus
163 data UserInfo = Username Text
165 data CorpusInfo = CorpusName Lang Text
166 | CorpusId Lang NodeId
169 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
170 => Maybe c -> Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
171 flow c u cn la docs = do
172 ids <- mapM (insertMasterDocs c la ) docs
173 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
175 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 => Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
177 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
179 ------------------------------------------------------------------------
182 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
183 => Lang -> Username -> Either CorpusName [CorpusId] -> Maybe c -> [NodeId] -> m CorpusId
184 flowCorpusUser l userName corpusName ctype ids = do
186 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
187 -- TODO: check if present already, ignore
188 _ <- Doc.add userCorpusId ids
189 tId <- mkTexts userCorpusId userId
190 printDebug "Node Text Id" tId
194 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
195 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
196 userListId <- flowList userId userCorpusId ngs
197 printDebug "userListId" userListId
199 _ <- mkDashboard userCorpusId userId
200 _ <- mkGraph userCorpusId userId
201 _ <- mkPhylo userCorpusId userId
206 -- _ <- mkAnnuaire rootUserId userId
210 insertMasterDocs :: ( FlowCmdM env err m
214 => Maybe c -> TermType Lang -> [a] -> m [DocId]
215 insertMasterDocs c lang hs = do
216 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
218 -- TODO Type NodeDocumentUnicised
219 let hs' = map addUniqId hs
220 ids <- insertDb masterUserId masterCorpusId hs'
221 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
224 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
227 Nothing -> trace ("buildTries here" :: String)
229 $ buildTries n ( fmap toToken $ uniText
230 $ Text.intercalate " . "
232 $ map hasText documentsWithId
238 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
239 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
240 terms2id <- insertNgrams $ Map.keys maps
241 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
243 lId <- getOrMkList masterCorpusId masterUserId
244 _ <- insertDocNgrams lId indexedNgrams
249 type CorpusName = Text
251 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
252 => Username -> Either CorpusName [CorpusId] -> Maybe a
253 -> Cmd err (UserId, RootId, CorpusId)
254 getOrMkRootWithCorpus username cName c = do
255 maybeUserId <- getUser username
256 userId <- case maybeUserId of
257 Nothing -> nodeError NoUserFound
258 Just user -> pure $ userLight_id user
260 rootId' <- map _node_id <$> getRoot username
262 rootId'' <- case rootId' of
263 [] -> mkRoot username userId
264 n -> case length n >= 2 of
265 True -> nodeError ManyNodeUsers
266 False -> pure rootId'
268 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
270 corpusId'' <- if username == userMaster
272 ns <- getCorporaWithParentId rootId
273 pure $ map _node_id ns
275 pure $ fromRight [] cName
277 corpusId' <- if corpusId'' /= []
279 else mk (Just $ fromLeft "Default" cName) c rootId userId
281 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
283 pure (userId, rootId, corpusId)
286 ------------------------------------------------------------------------
289 viewUniqId' :: UniqId a => a -> (HashId, a)
290 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
292 err = panic "[ERROR] Database.Flow.toInsert"
295 toInserted :: [ReturnId] -> Map HashId ReturnId
296 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
297 . filter (\r -> reInserted r == True)
299 data DocumentWithId a = DocumentWithId
300 { documentId :: !NodeId
304 instance HasText a => HasText (DocumentWithId a)
306 hasText (DocumentWithId _ a) = hasText a
308 mergeData :: Map HashId ReturnId
310 -> [DocumentWithId a]
311 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
313 toDocumentWithId (hash,hpd) =
314 DocumentWithId <$> fmap reId (lookup hash rs)
317 ------------------------------------------------------------------------
318 data DocumentIdWithNgrams a = DocumentIdWithNgrams
319 { documentWithId :: !(DocumentWithId a)
320 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
324 instance HasText HyperdataContact
328 instance ExtractNgramsT HyperdataContact
330 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
332 extract :: TermType Lang -> HyperdataContact
333 -> Cmd err (Map Ngrams (Map NgramsType Int))
335 let authors = map text2ngrams
336 $ maybe ["Nothing"] (\a -> [a])
337 $ view (hc_who . _Just . cw_lastName) hc'
339 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
341 instance HasText HyperdataDocument
343 hasText h = catMaybes [ _hyperdataDocument_title h
344 , _hyperdataDocument_abstract h
347 instance ExtractNgramsT HyperdataDocument
349 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
350 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
352 extractNgramsT' :: TermType Lang -> HyperdataDocument
353 -> Cmd err (Map Ngrams (Map NgramsType Int))
354 extractNgramsT' lang' doc = do
355 let source = text2ngrams
356 $ maybe "Nothing" identity
357 $ _hyperdataDocument_source doc
359 institutes = map text2ngrams
360 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
361 $ _hyperdataDocument_institutes doc
363 authors = map text2ngrams
364 $ maybe ["Nothing"] (splitOn ", ")
365 $ _hyperdataDocument_authors doc
367 terms' <- map text2ngrams
368 <$> map (intercalate " " . _terms_label)
370 <$> liftIO (extractTerms lang' $ hasText doc)
372 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
373 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
374 <> [(a', Map.singleton Authors 1) | a' <- authors ]
375 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
378 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
379 -> Map Ngrams (Map NgramsType Int)
380 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
382 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
384 False -> (Ngrams (Text.take s' t) n , y)
387 documentIdWithNgrams :: HasNodeError err
389 -> Cmd err (Map Ngrams (Map NgramsType Int)))
390 -> [DocumentWithId a]
391 -> Cmd err [DocumentIdWithNgrams a]
392 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
394 toDocumentIdWithNgrams d = do
395 e <- f $ documentData d
396 pure $ DocumentIdWithNgrams d e
400 -- | TODO check optimization
401 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
402 -> Map Ngrams (Map NgramsType (Map NodeId Int))
403 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
405 f :: DocumentIdWithNgrams a
406 -> Map Ngrams (Map NgramsType (Map NodeId Int))
407 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
409 nId = documentId $ documentWithId d
411 ------------------------------------------------------------------------
412 listInsert :: FlowCmdM env err m
413 => ListId -> Map NgramsType [NgramsElement]
415 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
416 -> putListNgrams lId typeList ngElmts
419 flowList :: FlowCmdM env err m => UserId -> CorpusId
420 -> Map NgramsType [NgramsElement]
422 flowList uId cId ngs = do
423 lId <- getOrMkList cId uId
424 printDebug "listId flowList" lId
426 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs