2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE NoImplicitPrelude #-}
24 {-# LANGUAGE OverloadedStrings #-}
25 {-# LANGUAGE RankNTypes #-}
26 {-# LANGUAGE FlexibleContexts #-}
28 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 import Control.Lens ((^.), view, Lens', _Just)
32 import Control.Monad (mapM_)
33 import Control.Monad.IO.Class (liftIO)
34 import Data.List (concat)
35 import Data.Map (Map, lookup, toList)
36 import Data.Maybe (Maybe(..), catMaybes)
38 import Data.Text (Text, splitOn, intercalate)
39 import GHC.Show (Show)
40 import Gargantext.API.Ngrams (HasRepoVar)
41 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
42 import Gargantext.Core (Lang(..))
43 import Gargantext.Core.Types (NodePoly(..), Terms(..))
44 import Gargantext.Core.Types.Individu (Username)
45 import Gargantext.Core.Types.Main
46 import Gargantext.Database.Config (userMaster, corpusMasterName)
47 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
48 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
49 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
50 import Gargantext.Database.Root (getRoot)
51 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
52 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
53 import Gargantext.Database.Schema.User (getUser, UserLight(..))
54 import Gargantext.Database.TextSearch (searchInDatabase)
55 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
56 import Gargantext.Database.Utils (Cmd, CmdM)
57 import Gargantext.Ext.IMT (toSchoolName)
58 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
59 import Gargantext.Prelude
60 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
61 import Gargantext.Text.Parsers (parseDocs, FileFormat)
62 import Gargantext.Text.Terms (TermType(..), tt_lang)
63 import Gargantext.Text.Terms (extractTerms)
64 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
65 import Servant (ServantErr)
66 import System.FilePath (FilePath)
67 import qualified Data.Map as DM
68 import qualified Data.Text as Text
69 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
70 import qualified Gargantext.Text.Parsers.GrandDebat as GD
72 type FlowCmdM env err m =
79 type FlowCorpus a = ( AddUniqId a
85 ------------------------------------------------------------------------
87 flowAnnuaire :: FlowCmdM env ServantErr m
88 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
89 flowAnnuaire u n l filePath = do
90 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
91 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
94 flowCorpusDebat :: FlowCmdM env ServantErr m
95 => Username -> CorpusName
98 flowCorpusDebat u n l fp = do
99 docs <- liftIO ( splitEvery 500
102 :: IO [[GD.GrandDebatReference ]]
104 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
107 flowCorpusFile :: FlowCmdM env ServantErr m
108 => Username -> CorpusName
109 -> Limit -- ^ Limit the number of docs (for dev purpose)
110 -> TermType Lang -> FileFormat -> FilePath
112 flowCorpusFile u n l la ff fp = do
113 docs <- liftIO ( splitEvery 500
117 flowCorpus u n la (map (map toHyperdataDocument) docs)
119 -- TODO query with complex query
120 flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
121 => Username -> Lang -> Text -> m CorpusId
122 flowCorpusSearchInDatabase u la q = do
123 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
124 ids <- map fst <$> searchInDatabase cId (stemIt q)
125 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
127 ------------------------------------------------------------------------
130 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
131 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
132 flow c u cn la docs = do
133 ids <- mapM (insertMasterDocs c la ) docs
134 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
136 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
137 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
138 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
141 flowCorpusUser :: (FlowCmdM env ServantErr m, MkCorpus c)
142 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
143 flowCorpusUser l userName corpusName ctype ids = do
145 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
146 -- TODO: check if present already, ignore
147 _ <- Doc.add userCorpusId ids
151 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
152 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
153 userListId <- flowList userId userCorpusId ngs
154 printDebug "userListId" userListId
156 _ <- mkGraph userCorpusId userId
159 -- User Dashboard Flow
160 _ <- mkDashboard userCorpusId userId
163 -- _ <- mkAnnuaire rootUserId userId
167 insertMasterDocs :: ( FlowCmdM env ServantErr m
171 => Maybe c -> TermType Lang -> [a] -> m [DocId]
172 insertMasterDocs c lang hs = do
173 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
175 -- TODO Type NodeDocumentUnicised
176 let hs' = map addUniqId hs
177 ids <- insertDb masterUserId masterCorpusId hs'
178 let documentsWithId = mergeData (toInserted ids) (DM.fromList $ map viewUniqId' hs')
180 docsWithNgrams <- documentIdWithNgrams (extractNgramsT lang) documentsWithId
182 let maps = mapNodeIdNgrams docsWithNgrams
184 terms2id <- insertNgrams $ DM.keys maps
185 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
186 _ <- insertToNodeNgrams indexedNgrams
191 type CorpusName = Text
193 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
194 => Username -> CorpusName -> Maybe a
195 -> Cmd err (UserId, RootId, CorpusId)
196 getOrMkRootWithCorpus username cName c = do
197 maybeUserId <- getUser username
198 userId <- case maybeUserId of
199 Nothing -> nodeError NoUserFound
200 Just user -> pure $ userLight_id user
202 rootId' <- map _node_id <$> getRoot username
204 rootId'' <- case rootId' of
205 [] -> mkRoot username userId
206 n -> case length n >= 2 of
207 True -> nodeError ManyNodeUsers
208 False -> pure rootId'
210 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
212 corpusId'' <- if username == userMaster
214 ns <- getCorporaWithParentId rootId
215 pure $ map _node_id ns
219 corpusId' <- if corpusId'' /= []
221 else mk (Just cName) c rootId userId
223 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
225 pure (userId, rootId, corpusId)
228 ------------------------------------------------------------------------
233 uniqId :: Lens' a (Maybe HashId)
236 instance UniqId HyperdataDocument
238 uniqId = hyperdataDocument_uniqId
240 instance UniqId HyperdataContact
244 viewUniqId' :: UniqId a => a -> (HashId, a)
245 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
247 err = panic "[ERROR] Database.Flow.toInsert"
250 toInserted :: [ReturnId] -> Map HashId ReturnId
251 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
252 . filter (\r -> reInserted r == True)
254 data DocumentWithId a = DocumentWithId
255 { documentId :: !NodeId
259 mergeData :: Map HashId ReturnId
261 -> [DocumentWithId a]
262 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
264 toDocumentWithId (hash,hpd) =
265 DocumentWithId <$> fmap reId (lookup hash rs)
268 ------------------------------------------------------------------------
269 data DocumentIdWithNgrams a = DocumentIdWithNgrams
270 { documentWithId :: !(DocumentWithId a)
271 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
274 -- TODO extractNgrams according to Type of Data
276 class ExtractNgramsT h
278 extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
281 instance ExtractNgramsT HyperdataContact
283 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
285 extract :: TermType Lang -> HyperdataContact
286 -> Cmd err (Map Ngrams (Map NgramsType Int))
288 let authors = map text2ngrams
289 $ maybe ["Nothing"] (\a -> [a])
290 $ view (hc_who . _Just . cw_lastName) hc'
292 pure $ DM.fromList $ [(a', DM.singleton Authors 1) | a' <- authors ]
297 instance ExtractNgramsT HyperdataDocument
299 extractNgramsT = extractNgramsT'
301 extractNgramsT' :: TermType Lang -> HyperdataDocument
302 -> Cmd err (Map Ngrams (Map NgramsType Int))
303 extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
305 extractNgramsT'' :: TermType Lang -> HyperdataDocument
306 -> Cmd err (Map Ngrams (Map NgramsType Int))
307 extractNgramsT'' lang' doc = do
308 let source = text2ngrams
309 $ maybe "Nothing" identity
310 $ _hyperdataDocument_source doc
312 institutes = map text2ngrams
313 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
314 $ _hyperdataDocument_institutes doc
316 authors = map text2ngrams
317 $ maybe ["Nothing"] (splitOn ", ")
318 $ _hyperdataDocument_authors doc
320 leText = catMaybes [ _hyperdataDocument_title doc
321 , _hyperdataDocument_abstract doc
324 terms' <- map text2ngrams
325 <$> map (intercalate " " . _terms_label)
327 <$> liftIO (extractTerms lang' leText)
329 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
330 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
331 <> [(a', DM.singleton Authors 1) | a' <- authors ]
332 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
335 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
336 -> Map Ngrams (Map NgramsType Int)
337 filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
339 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
341 False -> (Ngrams (Text.take s' t) n , y)
344 documentIdWithNgrams :: HasNodeError err
346 -> Cmd err (Map Ngrams (Map NgramsType Int)))
347 -> [DocumentWithId a]
348 -> Cmd err [DocumentIdWithNgrams a]
349 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
351 toDocumentIdWithNgrams d = do
352 e <- f $ documentData d
353 pure $ DocumentIdWithNgrams d e
358 -- | TODO check optimization
359 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
360 -> Map Ngrams (Map NgramsType (Map NodeId Int))
361 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
363 f :: DocumentIdWithNgrams a
364 -> Map Ngrams (Map NgramsType (Map NodeId Int))
365 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
367 nId = documentId $ documentWithId d
369 ------------------------------------------------------------------------
370 listInsert :: FlowCmdM env err m
371 => ListId -> Map NgramsType [NgramsElement]
373 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
374 -> putListNgrams lId typeList ngElmts
377 flowList :: FlowCmdM env err m => UserId -> CorpusId
378 -> Map NgramsType [NgramsElement]
380 flowList uId cId ngs = do
381 lId <- getOrMkList cId uId
382 printDebug "listId flowList" lId