2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE NoImplicitPrelude #-}
24 {-# LANGUAGE OverloadedStrings #-}
25 {-# LANGUAGE RankNTypes #-}
26 {-# LANGUAGE FlexibleContexts #-}
28 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 --import Debug.Trace (trace)
32 import Control.Lens ((^.), view, Lens', _Just)
33 import Control.Monad (mapM_)
34 import Control.Monad.IO.Class (liftIO)
35 import Data.List (concat)
36 import Data.Map (Map, lookup, toList)
37 import Data.Maybe (Maybe(..), catMaybes)
39 import Data.Text (Text, splitOn, intercalate)
40 import GHC.Show (Show)
41 import Gargantext.API.Ngrams (HasRepoVar)
42 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Types (NodePoly(..), Terms(..))
45 import Gargantext.Core.Types.Individu (Username)
46 import Gargantext.Core.Types.Main
47 import Gargantext.Database.Config (userMaster, corpusMasterName)
48 import Gargantext.Database.Flow.Utils (insertDocNgrams)
49 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
50 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
51 import Gargantext.Database.Root (getRoot)
52 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
53 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
54 import Gargantext.Database.Schema.User (getUser, UserLight(..))
55 import Gargantext.Database.TextSearch (searchInDatabase)
56 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
57 import Gargantext.Database.Utils (Cmd, CmdM)
58 import Gargantext.Ext.IMT (toSchoolName)
59 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
60 import Gargantext.Prelude
61 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
62 import Gargantext.Text.Parsers (parseFile, FileFormat)
63 import Gargantext.Text.Terms (TermType(..), tt_lang)
64 import Gargantext.Text.Terms (extractTerms)
65 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
66 import Servant (ServantErr)
67 import System.FilePath (FilePath)
68 --import qualified Data.List as List
69 import qualified Data.Map as Map
70 import qualified Data.Text as Text
71 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
72 import qualified Gargantext.Text.Parsers.GrandDebat as GD
74 type FlowCmdM env err m =
81 type FlowCorpus a = ( AddUniqId a
87 ------------------------------------------------------------------------
89 flowAnnuaire :: FlowCmdM env ServantErr m
90 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
91 flowAnnuaire u n l filePath = do
92 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
93 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
96 flowCorpusDebat :: FlowCmdM env ServantErr m
97 => Username -> CorpusName
100 flowCorpusDebat u n l fp = do
101 docs <- liftIO ( splitEvery 500
104 :: IO [[GD.GrandDebatReference ]]
106 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
109 flowCorpusFile :: FlowCmdM env ServantErr m
110 => Username -> CorpusName
111 -> Limit -- Limit the number of docs (for dev purpose)
112 -> TermType Lang -> FileFormat -> FilePath
114 flowCorpusFile u n l la ff fp = do
115 docs <- liftIO ( splitEvery 500
119 flowCorpus u n la (map (map toHyperdataDocument) docs)
121 -- TODO query with complex query
122 flowCorpusSearchInDatabase :: FlowCmdM env err m
123 => Username -> Lang -> Text -> m CorpusId
124 flowCorpusSearchInDatabase u la q = do
125 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
126 ids <- map fst <$> searchInDatabase cId (stemIt q)
127 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
130 flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
131 => Username -> Lang -> Text -> m CorpusId
132 flowCorpusSearchInDatabase' u la q = do
133 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
134 ids <- map fst <$> searchInDatabase cId (stemIt q)
135 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
137 ------------------------------------------------------------------------
139 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
140 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
141 flow c u cn la docs = do
142 ids <- mapM (insertMasterDocs c la ) docs
143 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
145 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
146 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
147 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
150 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
151 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
152 flowCorpusUser l userName corpusName ctype ids = do
154 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
155 -- TODO: check if present already, ignore
156 _ <- Doc.add userCorpusId ids
160 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
161 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
162 userListId <- flowList userId userCorpusId ngs
163 printDebug "userListId" userListId
165 _ <- mkGraph userCorpusId userId
168 -- User Dashboard Flow
169 _ <- mkDashboard userCorpusId userId
172 -- _ <- mkAnnuaire rootUserId userId
176 insertMasterDocs :: ( FlowCmdM env ServantErr m
180 => Maybe c -> TermType Lang -> [a] -> m [DocId]
181 insertMasterDocs c lang hs = do
182 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
184 -- TODO Type NodeDocumentUnicised
185 let hs' = map addUniqId hs
186 ids <- insertDb masterUserId masterCorpusId hs'
187 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
189 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
190 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
191 terms2id <- insertNgrams $ Map.keys maps
192 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
194 lId <- getOrMkList masterCorpusId masterUserId
195 _ <- insertDocNgrams lId indexedNgrams
200 type CorpusName = Text
202 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
203 => Username -> CorpusName -> Maybe a
204 -> Cmd err (UserId, RootId, CorpusId)
205 getOrMkRootWithCorpus username cName c = do
206 maybeUserId <- getUser username
207 userId <- case maybeUserId of
208 Nothing -> nodeError NoUserFound
209 Just user -> pure $ userLight_id user
211 rootId' <- map _node_id <$> getRoot username
213 rootId'' <- case rootId' of
214 [] -> mkRoot username userId
215 n -> case length n >= 2 of
216 True -> nodeError ManyNodeUsers
217 False -> pure rootId'
219 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
221 corpusId'' <- if username == userMaster
223 ns <- getCorporaWithParentId rootId
224 pure $ map _node_id ns
228 corpusId' <- if corpusId'' /= []
230 else mk (Just cName) c rootId userId
232 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
234 pure (userId, rootId, corpusId)
237 ------------------------------------------------------------------------
242 uniqId :: Lens' a (Maybe HashId)
245 instance UniqId HyperdataDocument
247 uniqId = hyperdataDocument_uniqId
249 instance UniqId HyperdataContact
253 viewUniqId' :: UniqId a => a -> (HashId, a)
254 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
256 err = panic "[ERROR] Database.Flow.toInsert"
259 toInserted :: [ReturnId] -> Map HashId ReturnId
260 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
261 . filter (\r -> reInserted r == True)
263 data DocumentWithId a = DocumentWithId
264 { documentId :: !NodeId
268 mergeData :: Map HashId ReturnId
270 -> [DocumentWithId a]
271 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
273 toDocumentWithId (hash,hpd) =
274 DocumentWithId <$> fmap reId (lookup hash rs)
277 ------------------------------------------------------------------------
278 data DocumentIdWithNgrams a = DocumentIdWithNgrams
279 { documentWithId :: !(DocumentWithId a)
280 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
283 -- TODO extractNgrams according to Type of Data
285 class ExtractNgramsT h
287 extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
290 instance ExtractNgramsT HyperdataContact
292 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
294 extract :: TermType Lang -> HyperdataContact
295 -> Cmd err (Map Ngrams (Map NgramsType Int))
297 let authors = map text2ngrams
298 $ maybe ["Nothing"] (\a -> [a])
299 $ view (hc_who . _Just . cw_lastName) hc'
301 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
305 instance ExtractNgramsT HyperdataDocument
307 extractNgramsT = extractNgramsT'
309 extractNgramsT' :: TermType Lang -> HyperdataDocument
310 -> Cmd err (Map Ngrams (Map NgramsType Int))
311 extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
313 extractNgramsT'' :: TermType Lang -> HyperdataDocument
314 -> Cmd err (Map Ngrams (Map NgramsType Int))
315 extractNgramsT'' lang' doc = do
316 let source = text2ngrams
317 $ maybe "Nothing" identity
318 $ _hyperdataDocument_source doc
320 institutes = map text2ngrams
321 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
322 $ _hyperdataDocument_institutes doc
324 authors = map text2ngrams
325 $ maybe ["Nothing"] (splitOn ", ")
326 $ _hyperdataDocument_authors doc
328 leText = catMaybes [ _hyperdataDocument_title doc
329 , _hyperdataDocument_abstract doc
332 terms' <- map text2ngrams
333 <$> map (intercalate " " . _terms_label)
335 <$> liftIO (extractTerms lang' leText)
337 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
338 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
339 <> [(a', Map.singleton Authors 1) | a' <- authors ]
340 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
343 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
344 -> Map Ngrams (Map NgramsType Int)
345 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
347 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
349 False -> (Ngrams (Text.take s' t) n , y)
352 documentIdWithNgrams :: HasNodeError err
354 -> Cmd err (Map Ngrams (Map NgramsType Int)))
355 -> [DocumentWithId a]
356 -> Cmd err [DocumentIdWithNgrams a]
357 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
359 toDocumentIdWithNgrams d = do
360 e <- f $ documentData d
361 pure $ DocumentIdWithNgrams d e
365 -- | TODO check optimization
366 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
367 -> Map Ngrams (Map NgramsType (Map NodeId Int))
368 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
370 f :: DocumentIdWithNgrams a
371 -> Map Ngrams (Map NgramsType (Map NodeId Int))
372 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
374 nId = documentId $ documentWithId d
376 ------------------------------------------------------------------------
377 listInsert :: FlowCmdM env err m
378 => ListId -> Map NgramsType [NgramsElement]
380 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
381 -> putListNgrams lId typeList ngElmts
384 flowList :: FlowCmdM env err m => UserId -> CorpusId
385 -> Map NgramsType [NgramsElement]
387 flowList uId cId ngs = do
388 lId <- getOrMkList cId uId
389 printDebug "listId flowList" lId
391 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs