2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
17 {-# LANGUAGE ConstraintKinds #-}
18 {-# LANGUAGE DeriveGeneric #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE FlexibleContexts #-}
24 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
27 --import Gargantext.Database.Metrics.Count (getNgramsElementsWithParentNodeId)
28 --import Gargantext.Database.Metrics.TFICF (getTficf)
29 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
30 --import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
31 --import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
32 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
33 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
34 --import Gargantext.Text.Metrics.TFICF (Tficf(..))
35 --import Debug.Trace (trace)
36 import Control.Lens ((^.), view, Lens', _Just)
37 import Control.Monad (mapM_)
38 import Control.Monad.IO.Class (liftIO)
39 import Data.List (concat)
40 import Data.Map (Map, lookup, toList)
41 import Data.Maybe (Maybe(..), catMaybes)
43 import Data.Text (Text, splitOn, intercalate)
44 import GHC.Show (Show)
45 import Gargantext.API.Ngrams (HasRepoVar)
46 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Types (NodePoly(..), Terms(..))
49 import Gargantext.Core.Types.Individu (Username)
50 import Gargantext.Core.Types.Main
51 import Gargantext.Database.TextSearch (searchInDatabase)
52 import Gargantext.Database.Config (userMaster, corpusMasterName)
53 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
54 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
55 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
56 import Gargantext.Database.Root (getRoot)
57 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
58 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
59 import Gargantext.Database.Schema.User (getUser, UserLight(..))
60 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
61 import Gargantext.Database.Utils (Cmd, CmdM)
62 import Gargantext.Ext.IMT (toSchoolName)
63 import Gargantext.Prelude
64 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65 import Gargantext.Text.Parsers (parseDocs, FileFormat)
66 import Gargantext.Text.Terms (TermType(..), tt_lang)
67 import Gargantext.Text.Terms (extractTerms)
68 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
69 import qualified Gargantext.Text.Parsers.GrandDebat as GD
70 import Servant (ServantErr)
71 import System.FilePath (FilePath)
72 import qualified Data.Map as DM
73 import qualified Data.Text as Text
74 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
76 type FlowCmdM env err m =
83 type FlowCorpus a = ( AddUniqId a
89 ------------------------------------------------------------------------
91 flowAnnuaire :: FlowCmdM env ServantErr m
92 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
93 flowAnnuaire u n l filePath = do
94 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
95 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
98 flowCorpusDebat :: FlowCmdM env ServantErr m
99 => Username -> CorpusName
102 flowCorpusDebat u n l fp = do
103 docs <- liftIO ( splitEvery 500
106 :: IO [[GD.GrandDebatReference ]]
108 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
111 flowCorpusFile :: FlowCmdM env ServantErr m
112 => Username -> CorpusName
113 -> Limit -- ^ Limit the number of docs (for dev purpose)
114 -> TermType Lang -> FileFormat -> FilePath
116 flowCorpusFile u n l la ff fp = do
117 docs <- liftIO ( splitEvery 500
121 flowCorpus u n la (map (map toHyperdataDocument) docs)
123 -- TODO query with complex query
124 flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
125 => Username -> Lang -> Text -> m CorpusId
126 flowCorpusSearchInDatabase u la q = do
127 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
128 ids <- map fst <$> searchInDatabase cId (stemIt q)
129 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
131 ------------------------------------------------------------------------
133 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
134 -- TODO-EVENTS: InsertedNodes
137 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
138 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
139 flow c u cn la docs = do
140 ids <- mapM (insertMasterDocs c la ) docs
141 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
143 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
144 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
145 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
148 flowCorpusUser :: (FlowCmdM env ServantErr m, MkCorpus c)
149 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
150 flowCorpusUser l userName corpusName ctype ids = do
152 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
153 -- TODO: check if present already, ignore
154 _ <- Doc.add userCorpusId ids
158 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
159 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
160 userListId <- flowList userId userCorpusId ngs
161 printDebug "userListId" userListId
163 _ <- mkGraph userCorpusId userId
166 -- User Dashboard Flow
167 _ <- mkDashboard userCorpusId userId
170 -- _ <- mkAnnuaire rootUserId userId
174 insertMasterDocs :: ( FlowCmdM env ServantErr m
178 => Maybe c -> TermType Lang -> [a] -> m [DocId]
179 insertMasterDocs c lang hs = do
180 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
182 -- TODO Type NodeDocumentUnicised
183 let hs' = map addUniqId hs
184 ids <- insertDb masterUserId masterCorpusId hs'
185 let documentsWithId = mergeData (toInserted ids) (DM.fromList $ map viewUniqId' hs')
187 docsWithNgrams <- documentIdWithNgrams (extractNgramsT lang) documentsWithId
189 let maps = mapNodeIdNgrams docsWithNgrams
191 terms2id <- insertNgrams $ DM.keys maps
192 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
193 _ <- insertToNodeNgrams indexedNgrams
198 type CorpusName = Text
200 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
201 => Username -> CorpusName -> Maybe a
202 -> Cmd err (UserId, RootId, CorpusId)
203 getOrMkRootWithCorpus username cName c = do
204 maybeUserId <- getUser username
205 userId <- case maybeUserId of
206 Nothing -> nodeError NoUserFound
207 Just user -> pure $ userLight_id user
209 rootId' <- map _node_id <$> getRoot username
211 rootId'' <- case rootId' of
212 [] -> mkRoot username userId
213 n -> case length n >= 2 of
214 True -> nodeError ManyNodeUsers
215 False -> pure rootId'
217 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
219 corpusId'' <- if username == userMaster
221 ns <- getCorporaWithParentId rootId
222 pure $ map _node_id ns
226 corpusId' <- if corpusId'' /= []
228 else mk (Just cName) c rootId userId
230 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
232 pure (userId, rootId, corpusId)
235 ------------------------------------------------------------------------
240 uniqId :: Lens' a (Maybe HashId)
243 instance UniqId HyperdataDocument
245 uniqId = hyperdataDocument_uniqId
247 instance UniqId HyperdataContact
251 viewUniqId' :: UniqId a => a -> (HashId, a)
252 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
254 err = panic "[ERROR] Database.Flow.toInsert"
257 toInserted :: [ReturnId] -> Map HashId ReturnId
258 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
259 . filter (\r -> reInserted r == True)
261 data DocumentWithId a = DocumentWithId
262 { documentId :: !NodeId
266 mergeData :: Map HashId ReturnId
268 -> [DocumentWithId a]
269 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
271 toDocumentWithId (hash,hpd) =
272 DocumentWithId <$> fmap reId (lookup hash rs)
275 ------------------------------------------------------------------------
276 data DocumentIdWithNgrams a = DocumentIdWithNgrams
277 { documentWithId :: !(DocumentWithId a)
278 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
281 -- TODO extractNgrams according to Type of Data
283 class ExtractNgramsT h
285 extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
288 instance ExtractNgramsT HyperdataContact
290 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
292 extract :: TermType Lang -> HyperdataContact
293 -> Cmd err (Map Ngrams (Map NgramsType Int))
295 let authors = map text2ngrams
296 $ maybe ["Nothing"] (\a -> [a])
297 $ view (hc_who . _Just . cw_lastName) hc'
299 pure $ DM.fromList $ [(a', DM.singleton Authors 1) | a' <- authors ]
304 instance ExtractNgramsT HyperdataDocument
306 extractNgramsT = extractNgramsT'
308 extractNgramsT' :: TermType Lang -> HyperdataDocument
309 -> Cmd err (Map Ngrams (Map NgramsType Int))
310 extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
312 extractNgramsT'' :: TermType Lang -> HyperdataDocument
313 -> Cmd err (Map Ngrams (Map NgramsType Int))
314 extractNgramsT'' lang' doc = do
315 let source = text2ngrams
316 $ maybe "Nothing" identity
317 $ _hyperdataDocument_source doc
319 institutes = map text2ngrams
320 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
321 $ _hyperdataDocument_institutes doc
323 authors = map text2ngrams
324 $ maybe ["Nothing"] (splitOn ", ")
325 $ _hyperdataDocument_authors doc
327 leText = catMaybes [ _hyperdataDocument_title doc
328 , _hyperdataDocument_abstract doc
331 terms' <- map text2ngrams
332 <$> map (intercalate " " . _terms_label)
334 <$> liftIO (extractTerms lang' leText)
336 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
337 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
338 <> [(a', DM.singleton Authors 1) | a' <- authors ]
339 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
342 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
343 -> Map Ngrams (Map NgramsType Int)
344 filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
346 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
348 False -> (Ngrams (Text.take s' t) n , y)
351 documentIdWithNgrams :: HasNodeError err
353 -> Cmd err (Map Ngrams (Map NgramsType Int)))
354 -> [DocumentWithId a]
355 -> Cmd err [DocumentIdWithNgrams a]
356 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
358 toDocumentIdWithNgrams d = do
359 e <- f $ documentData d
360 pure $ DocumentIdWithNgrams d e
365 -- | TODO check optimization
366 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
367 -> Map Ngrams (Map NgramsType (Map NodeId Int))
368 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
370 f :: DocumentIdWithNgrams a
371 -> Map Ngrams (Map NgramsType (Map NodeId Int))
372 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
374 nId = documentId $ documentWithId d
376 ------------------------------------------------------------------------
377 listInsert :: FlowCmdM env err m
378 => ListId -> Map NgramsType [NgramsElement]
380 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
381 -> putListNgrams lId typeList ngElmts
384 flowList :: FlowCmdM env err m => UserId -> CorpusId
385 -> Map NgramsType [NgramsElement]
387 flowList uId cId ngs = do
388 lId <- getOrMkList cId uId
389 printDebug "listId flowList" lId