2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 , flowCorpusSearchInDatabase
36 , getOrMkRootWithCorpus
40 import Prelude (String)
42 import Debug.Trace (trace)
43 import Control.Lens ((^.), view, _Just)
44 import Control.Monad (mapM_)
45 import Control.Monad.IO.Class (liftIO)
46 import Data.List (concat)
47 import Data.Map (Map, lookup, toList)
48 import Data.Maybe (Maybe(..), catMaybes)
50 import Data.Text (Text, splitOn, intercalate)
51 import GHC.Show (Show)
52 import Gargantext.API.Ngrams (HasRepoVar)
53 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Core.Types (NodePoly(..), Terms(..))
56 import Gargantext.Core.Types.Individu (Username)
57 import Gargantext.Core.Flow
58 import Gargantext.Core.Types.Main
59 import Gargantext.Database.Config (userMaster, corpusMasterName)
60 import Gargantext.Database.Flow.Utils (insertDocNgrams)
61 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Root (getRoot)
64 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
65 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
66 import Gargantext.Database.Schema.User (getUser, UserLight(..))
67 import Gargantext.Database.TextSearch (searchInDatabase)
68 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
69 import Gargantext.Database.Utils (Cmd, CmdM)
70 import Gargantext.Database.Triggers
71 import Gargantext.Ext.IMT (toSchoolName)
72 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
73 import Gargantext.Prelude
74 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
75 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
76 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
77 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
78 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
79 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
80 import Gargantext.Prelude.Utils hiding (sha)
81 import System.FilePath (FilePath)
82 import qualified Data.List as List
83 import qualified Data.Map as Map
84 import qualified Data.Text as Text
85 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
86 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
88 type FlowCmdM env err m =
95 ------------------------------------------------------------------------
97 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
103 -> IO [HyperdataDocument]
104 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
105 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
109 _flowCorpusApi :: ( FlowCmdM env err m)
110 => Username -> Either CorpusName [CorpusId]
115 _flowCorpusApi u n tt l q = do
116 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
117 flowCorpus u n tt docs
119 ------------------------------------------------------------------------
121 flowAnnuaire :: FlowCmdM env err m
123 -> Either CorpusName [CorpusId]
127 flowAnnuaire u n l filePath = do
128 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
129 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
132 _flowCorpusDebat :: FlowCmdM env err m
133 => Username -> Either CorpusName [CorpusId]
136 _flowCorpusDebat u n l fp = do
137 docs <- liftIO ( splitEvery 500
140 :: IO [[GD.GrandDebatReference ]]
142 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
144 flowCorpusFile :: FlowCmdM env err m
145 => Username -> Either CorpusName [CorpusId]
146 -> Limit -- Limit the number of docs (for dev purpose)
147 -> TermType Lang -> FileFormat -> FilePath
149 flowCorpusFile u n l la ff fp = do
150 docs <- liftIO ( splitEvery 500
154 flowCorpus u n la (map (map toHyperdataDocument) docs)
156 -- TODO query with complex query
157 flowCorpusSearchInDatabase :: FlowCmdM env err m
162 flowCorpusSearchInDatabase u la q = do
163 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
166 (Nothing :: Maybe HyperdataCorpus)
167 ids <- map fst <$> searchInDatabase cId (stemIt q)
168 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
172 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
177 _flowCorpusSearchInDatabaseApi u la q = do
178 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
181 (Nothing :: Maybe HyperdataCorpus)
182 ids <- map fst <$> searchInDatabase cId (stemIt q)
183 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
185 ------------------------------------------------------------------------
186 -- | TODO improve the needed type to create/update a corpus
188 data UserInfo = Username Text
190 data CorpusInfo = CorpusName Lang Text
191 | CorpusId Lang NodeId
194 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
197 -> Either CorpusName [CorpusId]
201 flow c u cn la docs = do
202 ids <- mapM (insertMasterDocs c la ) docs
203 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
205 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
207 -> Either CorpusName [CorpusId]
211 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
213 ------------------------------------------------------------------------
214 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
217 -> Either CorpusName [CorpusId]
221 flowCorpusUser l userName corpusName ctype ids = do
223 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
224 -- TODO: check if present already, ignore
225 _ <- Doc.add userCorpusId ids
226 tId <- mkNode NodeTexts userCorpusId userId
228 printDebug "Node Text Id" tId
232 (masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
233 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
234 userListId <- flowList userId userCorpusId ngs
235 mastListId <- getOrMkList masterCorpusId masterUserId
236 _ <- insertOccsUpdates userCorpusId mastListId
237 printDebug "userListId" userListId
239 _ <- mkDashboard userCorpusId userId
240 _ <- mkGraph userCorpusId userId
241 --_ <- mkPhylo userCorpusId userId
246 -- _ <- mkAnnuaire rootUserId userId
250 insertMasterDocs :: ( FlowCmdM env err m
258 insertMasterDocs c lang hs = do
259 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
261 -- TODO Type NodeDocumentUnicised
262 let hs' = map addUniqId hs
263 ids <- insertDb masterUserId masterCorpusId hs'
264 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
267 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
270 Nothing -> trace ("buildTries here" :: String)
272 $ buildTries n ( fmap toToken $ uniText
273 $ Text.intercalate " . "
275 $ map hasText documentsWithId
281 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
282 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
283 terms2id <- insertNgrams $ Map.keys maps
284 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
286 lId <- getOrMkList masterCorpusId masterUserId
287 _ <- insertDocNgrams lId indexedNgrams
292 type CorpusName = Text
295 getOrMkRoot :: (HasNodeError err)
297 -> Cmd err (UserId, RootId)
298 getOrMkRoot username = do
299 maybeUserId <- getUser username
300 userId <- case maybeUserId of
301 Nothing -> nodeError NoUserFound
302 Just user -> pure $ userLight_id user
304 rootId' <- map _node_id <$> getRoot username
306 rootId'' <- case rootId' of
307 [] -> mkRoot username userId
308 n -> case length n >= 2 of
309 True -> nodeError ManyNodeUsers
310 False -> pure rootId'
312 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
313 pure (userId, rootId)
316 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
318 -> Either CorpusName [CorpusId]
320 -> Cmd err (UserId, RootId, CorpusId)
321 getOrMkRootWithCorpus username cName c = do
322 (userId, rootId) <- getOrMkRoot username
323 corpusId'' <- if username == userMaster
325 ns <- getCorporaWithParentId rootId
326 pure $ map _node_id ns
328 pure $ fromRight [] cName
330 corpusId' <- if corpusId'' /= []
332 else mk (Just $ fromLeft "Default" cName) c rootId userId
334 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
336 pure (userId, rootId, corpusId)
339 ------------------------------------------------------------------------
340 viewUniqId' :: UniqId a
343 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
345 err = panic "[ERROR] Database.Flow.toInsert"
348 toInserted :: [ReturnId]
349 -> Map HashId ReturnId
351 Map.fromList . map (\r -> (reUniqId r, r) )
352 . filter (\r -> reInserted r == True)
354 data DocumentWithId a = DocumentWithId
355 { documentId :: !NodeId
359 instance HasText a => HasText (DocumentWithId a)
361 hasText (DocumentWithId _ a) = hasText a
363 mergeData :: Map HashId ReturnId
365 -> [DocumentWithId a]
366 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
368 toDocumentWithId (sha,hpd) =
369 DocumentWithId <$> fmap reId (lookup sha rs)
372 ------------------------------------------------------------------------
373 data DocumentIdWithNgrams a = DocumentIdWithNgrams
374 { documentWithId :: !(DocumentWithId a)
375 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
379 instance HasText HyperdataContact
383 instance ExtractNgramsT HyperdataContact
385 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
387 extract :: TermType Lang -> HyperdataContact
388 -> Cmd err (Map Ngrams (Map NgramsType Int))
390 let authors = map text2ngrams
391 $ maybe ["Nothing"] (\a -> [a])
392 $ view (hc_who . _Just . cw_lastName) hc'
394 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
396 instance HasText HyperdataDocument
398 hasText h = catMaybes [ _hyperdataDocument_title h
399 , _hyperdataDocument_abstract h
402 instance ExtractNgramsT HyperdataDocument
404 extractNgramsT :: TermType Lang
406 -> Cmd err (Map Ngrams (Map NgramsType Int))
407 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
409 extractNgramsT' :: TermType Lang
411 -> Cmd err (Map Ngrams (Map NgramsType Int))
412 extractNgramsT' lang' doc = do
413 let source = text2ngrams
414 $ maybe "Nothing" identity
415 $ _hyperdataDocument_source doc
417 institutes = map text2ngrams
418 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
419 $ _hyperdataDocument_institutes doc
421 authors = map text2ngrams
422 $ maybe ["Nothing"] (splitOn ", ")
423 $ _hyperdataDocument_authors doc
425 terms' <- map text2ngrams
426 <$> map (intercalate " " . _terms_label)
428 <$> liftIO (extractTerms lang' $ hasText doc)
430 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
431 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
432 <> [(a', Map.singleton Authors 1) | a' <- authors ]
433 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
435 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
436 -> Map Ngrams (Map NgramsType Int)
437 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
439 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
441 False -> (Ngrams (Text.take s' t) n , y)
444 documentIdWithNgrams :: HasNodeError err
446 -> Cmd err (Map Ngrams (Map NgramsType Int)))
447 -> [DocumentWithId a]
448 -> Cmd err [DocumentIdWithNgrams a]
449 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
451 toDocumentIdWithNgrams d = do
452 e <- f $ documentData d
453 pure $ DocumentIdWithNgrams d e
457 -- | TODO check optimization
458 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
459 -> Map Ngrams (Map NgramsType (Map NodeId Int))
460 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
462 f :: DocumentIdWithNgrams a
463 -> Map Ngrams (Map NgramsType (Map NodeId Int))
464 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
466 nId = documentId $ documentWithId d
468 ------------------------------------------------------------------------
469 listInsert :: FlowCmdM env err m
471 -> Map NgramsType [NgramsElement]
473 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
474 -> putListNgrams lId typeList ngElmts
477 flowList :: FlowCmdM env err m
480 -> Map NgramsType [NgramsElement]
482 flowList uId cId ngs = do
483 lId <- getOrMkList cId uId
484 printDebug "listId flowList" lId
486 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs