2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 , flowCorpusSearchInDatabase
36 , getOrMkRootWithCorpus
40 import Prelude (String)
42 import Debug.Trace (trace)
43 import Control.Lens ((^.), view, _Just)
44 import Control.Monad (mapM_)
45 import Control.Monad.IO.Class (liftIO)
46 import Data.List (concat)
47 import Data.Map (Map, lookup, toList)
48 import Data.Maybe (Maybe(..), catMaybes)
50 import Data.Text (Text, splitOn, intercalate)
51 import GHC.Show (Show)
52 import Gargantext.API.Ngrams (HasRepoVar)
53 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Core.Types (NodePoly(..), Terms(..))
56 import Gargantext.Core.Types.Individu (Username)
57 import Gargantext.Core.Flow
58 import Gargantext.Core.Types.Main
59 import Gargantext.Database.Config (userMaster, corpusMasterName)
60 import Gargantext.Database.Flow.Utils (insertDocNgrams)
61 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Root (getRoot)
64 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
65 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
66 import Gargantext.Database.Schema.User (getUser, UserLight(..))
67 import Gargantext.Database.TextSearch (searchInDatabase)
68 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
69 import Gargantext.Database.Utils (Cmd, CmdM)
70 import Gargantext.Ext.IMT (toSchoolName)
71 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
72 import Gargantext.Prelude
73 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
74 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
75 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
76 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
77 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
78 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
79 import Gargantext.Prelude.Utils hiding (sha)
80 import System.FilePath (FilePath)
81 import qualified Data.List as List
82 import qualified Data.Map as Map
83 import qualified Data.Text as Text
84 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
85 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
87 type FlowCmdM env err m =
94 ------------------------------------------------------------------------
96 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
102 -> IO [HyperdataDocument]
103 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
104 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
108 _flowCorpusApi :: ( FlowCmdM env err m)
109 => Username -> Either CorpusName [CorpusId]
114 _flowCorpusApi u n tt l q = do
115 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
116 flowCorpus u n tt docs
118 ------------------------------------------------------------------------
120 flowAnnuaire :: FlowCmdM env err m
122 -> Either CorpusName [CorpusId]
126 flowAnnuaire u n l filePath = do
127 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
128 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
131 _flowCorpusDebat :: FlowCmdM env err m
132 => Username -> Either CorpusName [CorpusId]
135 _flowCorpusDebat u n l fp = do
136 docs <- liftIO ( splitEvery 500
139 :: IO [[GD.GrandDebatReference ]]
141 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
143 flowCorpusFile :: FlowCmdM env err m
144 => Username -> Either CorpusName [CorpusId]
145 -> Limit -- Limit the number of docs (for dev purpose)
146 -> TermType Lang -> FileFormat -> FilePath
148 flowCorpusFile u n l la ff fp = do
149 docs <- liftIO ( splitEvery 500
153 flowCorpus u n la (map (map toHyperdataDocument) docs)
155 -- TODO query with complex query
156 flowCorpusSearchInDatabase :: FlowCmdM env err m
161 flowCorpusSearchInDatabase u la q = do
162 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
165 (Nothing :: Maybe HyperdataCorpus)
166 ids <- map fst <$> searchInDatabase cId (stemIt q)
167 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
171 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
176 _flowCorpusSearchInDatabaseApi u la q = do
177 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
180 (Nothing :: Maybe HyperdataCorpus)
181 ids <- map fst <$> searchInDatabase cId (stemIt q)
182 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
184 ------------------------------------------------------------------------
185 -- | TODO improve the needed type to create/update a corpus
187 data UserInfo = Username Text
189 data CorpusInfo = CorpusName Lang Text
190 | CorpusId Lang NodeId
193 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
196 -> Either CorpusName [CorpusId]
200 flow c u cn la docs = do
201 ids <- mapM (insertMasterDocs c la ) docs
202 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
204 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
206 -> Either CorpusName [CorpusId]
210 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
212 ------------------------------------------------------------------------
213 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
216 -> Either CorpusName [CorpusId]
220 flowCorpusUser l userName corpusName ctype ids = do
222 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
223 listId <- getOrMkList userCorpusId userId
224 -- TODO: check if present already, ignore
225 _ <- Doc.add userCorpusId ids
226 tId <- mkNode NodeTexts userCorpusId userId
228 printDebug "Node Text Id" tId
232 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
233 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
234 userListId <- flowList listId ngs
235 --mastListId <- getOrMkList masterCorpusId masterUserId
236 -- _ <- insertOccsUpdates userCorpusId mastListId
237 printDebug "userListId" userListId
239 _ <- mkDashboard userCorpusId userId
240 _ <- mkGraph userCorpusId userId
241 --_ <- mkPhylo userCorpusId userId
246 -- _ <- mkAnnuaire rootUserId userId
250 insertMasterDocs :: ( FlowCmdM env err m
258 insertMasterDocs c lang hs = do
259 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
261 -- TODO Type NodeDocumentUnicised
262 let hs' = map addUniqId hs
263 ids <- insertDb masterUserId masterCorpusId hs'
264 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
267 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
270 Nothing -> trace ("buildTries here" :: String)
272 $ buildTries n ( fmap toToken $ uniText
273 $ Text.intercalate " . "
275 $ map hasText documentsWithId
281 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
282 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
283 terms2id <- insertNgrams $ Map.keys maps
284 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
286 lId <- getOrMkList masterCorpusId masterUserId
287 _ <- insertDocNgrams lId indexedNgrams
292 type CorpusName = Text
295 getOrMkRoot :: (HasNodeError err)
297 -> Cmd err (UserId, RootId)
298 getOrMkRoot username = do
299 maybeUserId <- getUser username
300 userId <- case maybeUserId of
301 Nothing -> nodeError NoUserFound
302 Just user -> pure $ userLight_id user
304 rootId' <- map _node_id <$> getRoot username
306 rootId'' <- case rootId' of
307 [] -> mkRoot username userId
308 n -> case length n >= 2 of
309 True -> nodeError ManyNodeUsers
310 False -> pure rootId'
312 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
313 pure (userId, rootId)
316 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
318 -> Either CorpusName [CorpusId]
320 -> Cmd err (UserId, RootId, CorpusId)
321 getOrMkRootWithCorpus username cName c = do
322 (userId, rootId) <- getOrMkRoot username
323 corpusId'' <- if username == userMaster
325 ns <- getCorporaWithParentId rootId
326 pure $ map _node_id ns
328 pure $ fromRight [] cName
330 corpusId' <- if corpusId'' /= []
332 else mk (Just $ fromLeft "Default" cName) c rootId userId
334 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
336 pure (userId, rootId, corpusId)
339 ------------------------------------------------------------------------
340 viewUniqId' :: UniqId a
343 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
345 err = panic "[ERROR] Database.Flow.toInsert"
348 toInserted :: [ReturnId]
349 -> Map HashId ReturnId
351 Map.fromList . map (\r -> (reUniqId r, r) )
352 . filter (\r -> reInserted r == True)
354 data DocumentWithId a = DocumentWithId
355 { documentId :: !NodeId
359 instance HasText a => HasText (DocumentWithId a)
361 hasText (DocumentWithId _ a) = hasText a
363 mergeData :: Map HashId ReturnId
365 -> [DocumentWithId a]
366 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
368 toDocumentWithId (sha,hpd) =
369 DocumentWithId <$> fmap reId (lookup sha rs)
372 ------------------------------------------------------------------------
373 data DocumentIdWithNgrams a = DocumentIdWithNgrams
374 { documentWithId :: !(DocumentWithId a)
375 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
379 instance HasText HyperdataContact
383 instance ExtractNgramsT HyperdataContact
385 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
387 extract :: TermType Lang -> HyperdataContact
388 -> Cmd err (Map Ngrams (Map NgramsType Int))
390 let authors = map text2ngrams
391 $ maybe ["Nothing"] (\a -> [a])
392 $ view (hc_who . _Just . cw_lastName) hc'
394 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
396 instance HasText HyperdataDocument
398 hasText h = catMaybes [ _hyperdataDocument_title h
399 , _hyperdataDocument_abstract h
402 instance ExtractNgramsT HyperdataDocument
404 extractNgramsT :: TermType Lang
406 -> Cmd err (Map Ngrams (Map NgramsType Int))
407 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
409 extractNgramsT' :: TermType Lang
411 -> Cmd err (Map Ngrams (Map NgramsType Int))
412 extractNgramsT' lang' doc = do
413 let source = text2ngrams
414 $ maybe "Nothing" identity
415 $ _hyperdataDocument_source doc
417 institutes = map text2ngrams
418 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
419 $ _hyperdataDocument_institutes doc
421 authors = map text2ngrams
422 $ maybe ["Nothing"] (splitOn ", ")
423 $ _hyperdataDocument_authors doc
425 terms' <- map text2ngrams
426 <$> map (intercalate " " . _terms_label)
428 <$> liftIO (extractTerms lang' $ hasText doc)
430 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
431 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
432 <> [(a', Map.singleton Authors 1) | a' <- authors ]
433 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
435 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
436 -> Map Ngrams (Map NgramsType Int)
437 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
439 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
441 False -> (Ngrams (Text.take s' t) n , y)
444 documentIdWithNgrams :: HasNodeError err
446 -> Cmd err (Map Ngrams (Map NgramsType Int)))
447 -> [DocumentWithId a]
448 -> Cmd err [DocumentIdWithNgrams a]
449 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
451 toDocumentIdWithNgrams d = do
452 e <- f $ documentData d
453 pure $ DocumentIdWithNgrams d e
457 -- | TODO check optimization
458 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
459 -> Map Ngrams (Map NgramsType (Map NodeId Int))
460 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
462 f :: DocumentIdWithNgrams a
463 -> Map Ngrams (Map NgramsType (Map NodeId Int))
464 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
466 nId = documentId $ documentWithId d
468 ------------------------------------------------------------------------
469 listInsert :: FlowCmdM env err m
471 -> Map NgramsType [NgramsElement]
473 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
474 -> putListNgrams lId typeList ngElmts
477 flowList :: FlowCmdM env err m
479 -> Map NgramsType [NgramsElement]
481 flowList lId ngs = do
482 printDebug "listId flowList" lId
484 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs