2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
35 import Prelude (String)
37 import Debug.Trace (trace)
38 import Control.Lens ((^.), view, _Just)
39 import Control.Monad (mapM_)
40 import Control.Monad.IO.Class (liftIO)
41 import Data.List (concat)
42 import Data.Map (Map, lookup, toList)
43 import Data.Maybe (Maybe(..), catMaybes)
45 import Data.Text (Text, splitOn, intercalate)
46 import GHC.Show (Show)
47 import Gargantext.API.Ngrams (HasRepoVar)
48 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Core.Types (NodePoly(..), Terms(..))
51 import Gargantext.Core.Types.Individu (Username)
52 import Gargantext.Core.Flow
53 import Gargantext.Core.Types.Main
54 import Gargantext.Database.Config (userMaster, corpusMasterName)
55 import Gargantext.Database.Flow.Utils (insertDocNgrams)
56 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
57 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
58 import Gargantext.Database.Root (getRoot)
59 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
60 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
61 import Gargantext.Database.Schema.User (getUser, UserLight(..))
62 import Gargantext.Database.TextSearch (searchInDatabase)
63 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
64 import Gargantext.Database.Utils (Cmd, CmdM)
65 import Gargantext.Ext.IMT (toSchoolName)
66 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
67 import Gargantext.Prelude
68 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
69 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
70 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
71 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
72 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
73 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
74 import Gargantext.Prelude.Utils hiding (hash)
75 import System.FilePath (FilePath)
76 import qualified Data.List as List
77 import qualified Data.Map as Map
78 import qualified Data.Text as Text
79 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
80 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
82 type FlowCmdM env err m =
89 ------------------------------------------------------------------------
91 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
97 -> IO [HyperdataDocument]
98 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
99 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
102 flowCorpusApi :: ( FlowCmdM env err m)
103 => Username -> Either CorpusName [CorpusId]
108 flowCorpusApi u n tt l q = do
109 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
110 flowCorpus u n tt docs
112 ------------------------------------------------------------------------
114 flowAnnuaire :: FlowCmdM env err m
115 => Username -> Either CorpusName [CorpusId] -> (TermType Lang) -> FilePath -> m AnnuaireId
116 flowAnnuaire u n l filePath = do
117 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
118 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
121 flowCorpusDebat :: FlowCmdM env err m
122 => Username -> Either CorpusName [CorpusId]
125 flowCorpusDebat u n l fp = do
126 docs <- liftIO ( splitEvery 500
129 :: IO [[GD.GrandDebatReference ]]
131 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
133 flowCorpusFile :: FlowCmdM env err m
134 => Username -> Either CorpusName [CorpusId]
135 -> Limit -- Limit the number of docs (for dev purpose)
136 -> TermType Lang -> FileFormat -> FilePath
138 flowCorpusFile u n l la ff fp = do
139 docs <- liftIO ( splitEvery 500
143 flowCorpus u n la (map (map toHyperdataDocument) docs)
145 -- TODO query with complex query
146 flowCorpusSearchInDatabase :: FlowCmdM env err m
147 => Username -> Lang -> Text -> m CorpusId
148 flowCorpusSearchInDatabase u la q = do
149 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
150 ids <- map fst <$> searchInDatabase cId (stemIt q)
151 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
154 flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
155 => Username -> Lang -> Text -> m CorpusId
156 flowCorpusSearchInDatabaseApi u la q = do
157 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
158 ids <- map fst <$> searchInDatabase cId (stemIt q)
159 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
161 ------------------------------------------------------------------------
162 -- | TODO improve the needed type to create/update a corpus
163 data UserInfo = Username Text
165 data CorpusInfo = CorpusName Lang Text
166 | CorpusId Lang NodeId
169 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
170 => Maybe c -> Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
171 flow c u cn la docs = do
172 ids <- mapM (insertMasterDocs c la ) docs
173 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
175 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 => Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
177 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
179 ------------------------------------------------------------------------
180 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
181 => Lang -> Username -> Either CorpusName [CorpusId] -> Maybe c -> [NodeId] -> m CorpusId
182 flowCorpusUser l userName corpusName ctype ids = do
184 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
185 -- TODO: check if present already, ignore
186 _ <- Doc.add userCorpusId ids
187 tId <- mkNode NodeTexts userCorpusId userId
189 printDebug "Node Text Id" tId
193 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
194 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
195 userListId <- flowList userId userCorpusId ngs
196 printDebug "userListId" userListId
198 _ <- mkDashboard userCorpusId userId
199 _ <- mkGraph userCorpusId userId
200 --_ <- mkPhylo userCorpusId userId
205 -- _ <- mkAnnuaire rootUserId userId
209 insertMasterDocs :: ( FlowCmdM env err m
213 => Maybe c -> TermType Lang -> [a] -> m [DocId]
214 insertMasterDocs c lang hs = do
215 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
217 -- TODO Type NodeDocumentUnicised
218 let hs' = map addUniqId hs
219 ids <- insertDb masterUserId masterCorpusId hs'
220 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
223 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
226 Nothing -> trace ("buildTries here" :: String)
228 $ buildTries n ( fmap toToken $ uniText
229 $ Text.intercalate " . "
231 $ map hasText documentsWithId
237 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
238 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
239 terms2id <- insertNgrams $ Map.keys maps
240 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
242 lId <- getOrMkList masterCorpusId masterUserId
243 _ <- insertDocNgrams lId indexedNgrams
247 type CorpusName = Text
249 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
250 => Username -> Either CorpusName [CorpusId] -> Maybe a
251 -> Cmd err (UserId, RootId, CorpusId)
252 getOrMkRootWithCorpus username cName c = do
253 maybeUserId <- getUser username
254 userId <- case maybeUserId of
255 Nothing -> nodeError NoUserFound
256 Just user -> pure $ userLight_id user
258 rootId' <- map _node_id <$> getRoot username
260 rootId'' <- case rootId' of
261 [] -> mkRoot username userId
262 n -> case length n >= 2 of
263 True -> nodeError ManyNodeUsers
264 False -> pure rootId'
266 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
268 corpusId'' <- if username == userMaster
270 ns <- getCorporaWithParentId rootId
271 pure $ map _node_id ns
273 pure $ fromRight [] cName
275 corpusId' <- if corpusId'' /= []
277 else mk (Just $ fromLeft "Default" cName) c rootId userId
279 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
281 pure (userId, rootId, corpusId)
284 ------------------------------------------------------------------------
285 viewUniqId' :: UniqId a => a -> (HashId, a)
286 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
288 err = panic "[ERROR] Database.Flow.toInsert"
291 toInserted :: [ReturnId] -> Map HashId ReturnId
292 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
293 . filter (\r -> reInserted r == True)
295 data DocumentWithId a = DocumentWithId
296 { documentId :: !NodeId
300 instance HasText a => HasText (DocumentWithId a)
302 hasText (DocumentWithId _ a) = hasText a
304 mergeData :: Map HashId ReturnId
306 -> [DocumentWithId a]
307 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
309 toDocumentWithId (hash,hpd) =
310 DocumentWithId <$> fmap reId (lookup hash rs)
313 ------------------------------------------------------------------------
314 data DocumentIdWithNgrams a = DocumentIdWithNgrams
315 { documentWithId :: !(DocumentWithId a)
316 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
320 instance HasText HyperdataContact
324 instance ExtractNgramsT HyperdataContact
326 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
328 extract :: TermType Lang -> HyperdataContact
329 -> Cmd err (Map Ngrams (Map NgramsType Int))
331 let authors = map text2ngrams
332 $ maybe ["Nothing"] (\a -> [a])
333 $ view (hc_who . _Just . cw_lastName) hc'
335 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
337 instance HasText HyperdataDocument
339 hasText h = catMaybes [ _hyperdataDocument_title h
340 , _hyperdataDocument_abstract h
343 instance ExtractNgramsT HyperdataDocument
345 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
346 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
348 extractNgramsT' :: TermType Lang -> HyperdataDocument
349 -> Cmd err (Map Ngrams (Map NgramsType Int))
350 extractNgramsT' lang' doc = do
351 let source = text2ngrams
352 $ maybe "Nothing" identity
353 $ _hyperdataDocument_source doc
355 institutes = map text2ngrams
356 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
357 $ _hyperdataDocument_institutes doc
359 authors = map text2ngrams
360 $ maybe ["Nothing"] (splitOn ", ")
361 $ _hyperdataDocument_authors doc
363 terms' <- map text2ngrams
364 <$> map (intercalate " " . _terms_label)
366 <$> liftIO (extractTerms lang' $ hasText doc)
368 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
369 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
370 <> [(a', Map.singleton Authors 1) | a' <- authors ]
371 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
374 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
375 -> Map Ngrams (Map NgramsType Int)
376 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
378 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
380 False -> (Ngrams (Text.take s' t) n , y)
383 documentIdWithNgrams :: HasNodeError err
385 -> Cmd err (Map Ngrams (Map NgramsType Int)))
386 -> [DocumentWithId a]
387 -> Cmd err [DocumentIdWithNgrams a]
388 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
390 toDocumentIdWithNgrams d = do
391 e <- f $ documentData d
392 pure $ DocumentIdWithNgrams d e
396 -- | TODO check optimization
397 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
398 -> Map Ngrams (Map NgramsType (Map NodeId Int))
399 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
401 f :: DocumentIdWithNgrams a
402 -> Map Ngrams (Map NgramsType (Map NodeId Int))
403 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
405 nId = documentId $ documentWithId d
407 ------------------------------------------------------------------------
408 listInsert :: FlowCmdM env err m
409 => ListId -> Map NgramsType [NgramsElement]
411 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
412 -> putListNgrams lId typeList ngElmts
415 flowList :: FlowCmdM env err m => UserId -> CorpusId
416 -> Map NgramsType [NgramsElement]
418 flowList uId cId ngs = do
419 lId <- getOrMkList cId uId
420 printDebug "listId flowList" lId
422 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs