2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 , flowCorpusSearchInDatabase
36 , getOrMkRootWithCorpus
40 import Prelude (String)
42 import Debug.Trace (trace)
43 import Control.Lens ((^.), view, _Just)
44 import Control.Monad (mapM_)
45 import Control.Monad.IO.Class (liftIO)
46 import Data.List (concat)
47 import Data.Map (Map, lookup, toList)
48 import Data.Maybe (Maybe(..), catMaybes)
50 import Data.Text (Text, splitOn, intercalate)
51 import GHC.Show (Show)
52 import Gargantext.API.Ngrams (HasRepoVar)
53 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Core.Types (NodePoly(..), Terms(..))
56 import Gargantext.Core.Types.Individu (Username)
57 import Gargantext.Core.Flow
58 import Gargantext.Core.Types.Main
59 import Gargantext.Database.Config (userMaster, corpusMasterName)
60 import Gargantext.Database.Flow.Utils (insertDocNgrams)
61 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Root (getRoot)
64 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
65 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
66 import Gargantext.Database.Schema.User (getUser, UserLight(..))
67 import Gargantext.Database.TextSearch (searchInDatabase)
68 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
69 import Gargantext.Database.Utils (Cmd, CmdM)
70 import Gargantext.Ext.IMT (toSchoolName)
71 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
72 import Gargantext.Prelude
73 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
74 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
75 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
76 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
77 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
78 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
79 import Gargantext.Prelude.Utils hiding (sha)
80 import System.FilePath (FilePath)
81 import qualified Data.List as List
82 import qualified Data.Map as Map
83 import qualified Data.Text as Text
84 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
85 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
87 type FlowCmdM env err m =
94 ------------------------------------------------------------------------
96 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
102 -> IO [HyperdataDocument]
103 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
104 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
108 _flowCorpusApi :: ( FlowCmdM env err m)
109 => Username -> Either CorpusName [CorpusId]
114 _flowCorpusApi u n tt l q = do
115 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
116 flowCorpus u n tt docs
118 ------------------------------------------------------------------------
120 flowAnnuaire :: FlowCmdM env err m
122 -> Either CorpusName [CorpusId]
126 flowAnnuaire u n l filePath = do
127 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
128 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
131 _flowCorpusDebat :: FlowCmdM env err m
132 => Username -> Either CorpusName [CorpusId]
135 _flowCorpusDebat u n l fp = do
136 docs <- liftIO ( splitEvery 500
139 :: IO [[GD.GrandDebatReference ]]
141 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
143 flowCorpusFile :: FlowCmdM env err m
144 => Username -> Either CorpusName [CorpusId]
145 -> Limit -- Limit the number of docs (for dev purpose)
146 -> TermType Lang -> FileFormat -> FilePath
148 flowCorpusFile u n l la ff fp = do
149 docs <- liftIO ( splitEvery 500
153 flowCorpus u n la (map (map toHyperdataDocument) docs)
155 -- TODO query with complex query
156 flowCorpusSearchInDatabase :: FlowCmdM env err m
161 flowCorpusSearchInDatabase u la q = do
162 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
165 (Nothing :: Maybe HyperdataCorpus)
166 ids <- map fst <$> searchInDatabase cId (stemIt q)
167 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
171 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
176 _flowCorpusSearchInDatabaseApi u la q = do
177 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
180 (Nothing :: Maybe HyperdataCorpus)
181 ids <- map fst <$> searchInDatabase cId (stemIt q)
182 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
184 ------------------------------------------------------------------------
185 -- | TODO improve the needed type to create/update a corpus
187 data UserInfo = Username Text
189 data CorpusInfo = CorpusName Lang Text
190 | CorpusId Lang NodeId
193 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
196 -> Either CorpusName [CorpusId]
200 flow c u cn la docs = do
201 ids <- mapM (insertMasterDocs c la ) docs
202 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
204 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
206 -> Either CorpusName [CorpusId]
210 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
212 ------------------------------------------------------------------------
213 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
216 -> Either CorpusName [CorpusId]
220 flowCorpusUser l userName corpusName ctype ids = do
222 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
223 -- TODO: check if present already, ignore
224 _ <- Doc.add userCorpusId ids
225 tId <- mkNode NodeTexts userCorpusId userId
227 printDebug "Node Text Id" tId
231 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
232 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
233 userListId <- flowList userId userCorpusId ngs
234 printDebug "userListId" userListId
236 _ <- mkDashboard userCorpusId userId
237 _ <- mkGraph userCorpusId userId
238 --_ <- mkPhylo userCorpusId userId
243 -- _ <- mkAnnuaire rootUserId userId
247 insertMasterDocs :: ( FlowCmdM env err m
255 insertMasterDocs c lang hs = do
256 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
258 -- TODO Type NodeDocumentUnicised
259 let hs' = map addUniqId hs
260 ids <- insertDb masterUserId masterCorpusId hs'
261 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
264 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
267 Nothing -> trace ("buildTries here" :: String)
269 $ buildTries n ( fmap toToken $ uniText
270 $ Text.intercalate " . "
272 $ map hasText documentsWithId
278 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
279 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
280 terms2id <- insertNgrams $ Map.keys maps
281 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
283 lId <- getOrMkList masterCorpusId masterUserId
284 _ <- insertDocNgrams lId indexedNgrams
289 type CorpusName = Text
292 getOrMkRoot :: (HasNodeError err)
294 -> Cmd err (UserId, RootId)
295 getOrMkRoot username = do
296 maybeUserId <- getUser username
297 userId <- case maybeUserId of
298 Nothing -> nodeError NoUserFound
299 Just user -> pure $ userLight_id user
301 rootId' <- map _node_id <$> getRoot username
303 rootId'' <- case rootId' of
304 [] -> mkRoot username userId
305 n -> case length n >= 2 of
306 True -> nodeError ManyNodeUsers
307 False -> pure rootId'
309 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
310 pure (userId, rootId)
313 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
315 -> Either CorpusName [CorpusId]
317 -> Cmd err (UserId, RootId, CorpusId)
318 getOrMkRootWithCorpus username cName c = do
319 (userId, rootId) <- getOrMkRoot username
320 corpusId'' <- if username == userMaster
322 ns <- getCorporaWithParentId rootId
323 pure $ map _node_id ns
325 pure $ fromRight [] cName
327 corpusId' <- if corpusId'' /= []
329 else mk (Just $ fromLeft "Default" cName) c rootId userId
331 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
333 pure (userId, rootId, corpusId)
336 ------------------------------------------------------------------------
337 viewUniqId' :: UniqId a
340 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
342 err = panic "[ERROR] Database.Flow.toInsert"
345 toInserted :: [ReturnId]
346 -> Map HashId ReturnId
348 Map.fromList . map (\r -> (reUniqId r, r) )
349 . filter (\r -> reInserted r == True)
351 data DocumentWithId a = DocumentWithId
352 { documentId :: !NodeId
356 instance HasText a => HasText (DocumentWithId a)
358 hasText (DocumentWithId _ a) = hasText a
360 mergeData :: Map HashId ReturnId
362 -> [DocumentWithId a]
363 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
365 toDocumentWithId (sha,hpd) =
366 DocumentWithId <$> fmap reId (lookup sha rs)
369 ------------------------------------------------------------------------
370 data DocumentIdWithNgrams a = DocumentIdWithNgrams
371 { documentWithId :: !(DocumentWithId a)
372 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
376 instance HasText HyperdataContact
380 instance ExtractNgramsT HyperdataContact
382 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
384 extract :: TermType Lang -> HyperdataContact
385 -> Cmd err (Map Ngrams (Map NgramsType Int))
387 let authors = map text2ngrams
388 $ maybe ["Nothing"] (\a -> [a])
389 $ view (hc_who . _Just . cw_lastName) hc'
391 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
393 instance HasText HyperdataDocument
395 hasText h = catMaybes [ _hyperdataDocument_title h
396 , _hyperdataDocument_abstract h
399 instance ExtractNgramsT HyperdataDocument
401 extractNgramsT :: TermType Lang
403 -> Cmd err (Map Ngrams (Map NgramsType Int))
404 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
406 extractNgramsT' :: TermType Lang
408 -> Cmd err (Map Ngrams (Map NgramsType Int))
409 extractNgramsT' lang' doc = do
410 let source = text2ngrams
411 $ maybe "Nothing" identity
412 $ _hyperdataDocument_source doc
414 institutes = map text2ngrams
415 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
416 $ _hyperdataDocument_institutes doc
418 authors = map text2ngrams
419 $ maybe ["Nothing"] (splitOn ", ")
420 $ _hyperdataDocument_authors doc
422 terms' <- map text2ngrams
423 <$> map (intercalate " " . _terms_label)
425 <$> liftIO (extractTerms lang' $ hasText doc)
427 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
428 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
429 <> [(a', Map.singleton Authors 1) | a' <- authors ]
430 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
432 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
433 -> Map Ngrams (Map NgramsType Int)
434 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
436 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
438 False -> (Ngrams (Text.take s' t) n , y)
441 documentIdWithNgrams :: HasNodeError err
443 -> Cmd err (Map Ngrams (Map NgramsType Int)))
444 -> [DocumentWithId a]
445 -> Cmd err [DocumentIdWithNgrams a]
446 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
448 toDocumentIdWithNgrams d = do
449 e <- f $ documentData d
450 pure $ DocumentIdWithNgrams d e
454 -- | TODO check optimization
455 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
456 -> Map Ngrams (Map NgramsType (Map NodeId Int))
457 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
459 f :: DocumentIdWithNgrams a
460 -> Map Ngrams (Map NgramsType (Map NodeId Int))
461 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
463 nId = documentId $ documentWithId d
465 ------------------------------------------------------------------------
466 listInsert :: FlowCmdM env err m
468 -> Map NgramsType [NgramsElement]
470 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
471 -> putListNgrams lId typeList ngElmts
474 flowList :: FlowCmdM env err m
477 -> Map NgramsType [NgramsElement]
479 flowList uId cId ngs = do
480 lId <- getOrMkList cId uId
481 printDebug "listId flowList" lId
483 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs