]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
test
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -}
16
17 {-# LANGUAGE ConstraintKinds #-}
18 {-# LANGUAGE DeriveGeneric #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE FlexibleContexts #-}
23
24 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
25 where
26
27 --import Gargantext.Database.Metrics.Count (getNgramsElementsWithParentNodeId)
28 --import Gargantext.Database.Metrics.TFICF (getTficf)
29 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
30 --import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
31 --import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
32 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
33 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
34 --import Gargantext.Text.Metrics.TFICF (Tficf(..))
35 --import Debug.Trace (trace)
36 import Control.Lens ((^.), view, Lens', _Just)
37 import Control.Monad (mapM_)
38 import Control.Monad.IO.Class (liftIO)
39 import Data.List (concat)
40 import Data.Map (Map, lookup, toList)
41 import Data.Maybe (Maybe(..), catMaybes)
42 import Data.Monoid
43 import Data.Text (Text, splitOn, intercalate)
44 import GHC.Show (Show)
45 import Gargantext.API.Ngrams (HasRepoVar)
46 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Types (NodePoly(..), Terms(..))
49 import Gargantext.Core.Types.Individu (Username)
50 import Gargantext.Core.Types.Main
51 import Gargantext.Database.TextSearch (searchInDatabase)
52 import Gargantext.Database.Config (userMaster, corpusMasterName)
53 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
54 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
55 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
56 import Gargantext.Database.Root (getRoot)
57 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
58 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
59 import Gargantext.Database.Schema.User (getUser, UserLight(..))
60 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
61 import Gargantext.Database.Utils (Cmd, CmdM)
62 import Gargantext.Ext.IMT (toSchoolName)
63 import Gargantext.Prelude
64 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65 import Gargantext.Text.Parsers (parseDocs, FileFormat)
66 import Gargantext.Text.Terms (TermType(..), tt_lang)
67 import Gargantext.Text.Terms (extractTerms)
68 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
69 import qualified Gargantext.Text.Parsers.GrandDebat as GD
70 import Servant (ServantErr)
71 import System.FilePath (FilePath)
72 import qualified Data.Map as DM
73 import qualified Data.Text as Text
74 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
75
76 type FlowCmdM env err m =
77 ( CmdM env err m
78 , RepoCmdM env err m
79 , HasNodeError err
80 , HasRepoVar env
81 )
82
83 type FlowCorpus a = ( AddUniqId a
84 , UniqId a
85 , InsertDb a
86 , ExtractNgramsT a
87 )
88
89 ------------------------------------------------------------------------
90
91 flowAnnuaire :: FlowCmdM env ServantErr m
92 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
93 flowAnnuaire u n l filePath = do
94 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
95 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
96
97
98 flowCorpusDebat :: FlowCmdM env ServantErr m
99 => Username -> CorpusName
100 -> Limit -> FilePath
101 -> m CorpusId
102 flowCorpusDebat u n l fp = do
103 docs <- liftIO ( splitEvery 500
104 <$> take l
105 <$> GD.readFile fp
106 :: IO [[GD.GrandDebatReference ]]
107 )
108 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
109
110
111 flowCorpusFile :: FlowCmdM env ServantErr m
112 => Username -> CorpusName
113 -> Limit -- ^ Limit the number of docs (for dev purpose)
114 -> TermType Lang -> FileFormat -> FilePath
115 -> m CorpusId
116 flowCorpusFile u n l la ff fp = do
117 docs <- liftIO ( splitEvery 500
118 <$> take l
119 <$> parseDocs ff fp
120 )
121 flowCorpus u n la (map (map toHyperdataDocument) docs)
122
123 -- TODO query with complex query
124 flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
125 => Username -> Lang -> Text -> m CorpusId
126 flowCorpusSearchInDatabase u la q = do
127 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
128 ids <- map fst <$> searchInDatabase cId (stemIt q)
129 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
130
131 ------------------------------------------------------------------------
132
133 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
134 -- TODO-EVENTS: InsertedNodes
135
136
137 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
138 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
139 flow c u cn la docs = do
140 ids <- mapM (insertMasterDocs c la ) docs
141 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
142
143 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
144 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
145 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
146
147
148 flowCorpusUser :: (FlowCmdM env ServantErr m, MkCorpus c)
149 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
150 flowCorpusUser l userName corpusName ctype ids = do
151 -- User Flow
152 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
153 -- TODO: check if present already, ignore
154 _ <- Doc.add userCorpusId ids
155
156 -- User List Flow
157 --{-
158 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
159 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
160 userListId <- flowList userId userCorpusId ngs
161 printDebug "userListId" userListId
162 -- User Graph Flow
163 _ <- mkGraph userCorpusId userId
164 --}
165
166 -- User Dashboard Flow
167 _ <- mkDashboard userCorpusId userId
168
169 -- Annuaire Flow
170 -- _ <- mkAnnuaire rootUserId userId
171 pure userCorpusId
172
173
174 insertMasterDocs :: ( FlowCmdM env ServantErr m
175 , FlowCorpus a
176 , MkCorpus c
177 )
178 => Maybe c -> TermType Lang -> [a] -> m [DocId]
179 insertMasterDocs c lang hs = do
180 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
181
182 -- TODO Type NodeDocumentUnicised
183 let hs' = map addUniqId hs
184 ids <- insertDb masterUserId masterCorpusId hs'
185 let documentsWithId = mergeData (toInserted ids) (DM.fromList $ map viewUniqId' hs')
186
187 docsWithNgrams <- documentIdWithNgrams (extractNgramsT lang) documentsWithId
188
189 let maps = mapNodeIdNgrams docsWithNgrams
190
191 terms2id <- insertNgrams $ DM.keys maps
192 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
193 _ <- insertToNodeNgrams indexedNgrams
194 pure $ map reId ids
195
196
197
198 type CorpusName = Text
199
200 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
201 => Username -> CorpusName -> Maybe a
202 -> Cmd err (UserId, RootId, CorpusId)
203 getOrMkRootWithCorpus username cName c = do
204 maybeUserId <- getUser username
205 userId <- case maybeUserId of
206 Nothing -> nodeError NoUserFound
207 Just user -> pure $ userLight_id user
208
209 rootId' <- map _node_id <$> getRoot username
210
211 rootId'' <- case rootId' of
212 [] -> mkRoot username userId
213 n -> case length n >= 2 of
214 True -> nodeError ManyNodeUsers
215 False -> pure rootId'
216
217 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
218
219 corpusId'' <- if username == userMaster
220 then do
221 ns <- getCorporaWithParentId rootId
222 pure $ map _node_id ns
223 else
224 pure []
225
226 corpusId' <- if corpusId'' /= []
227 then pure corpusId''
228 else mk (Just cName) c rootId userId
229
230 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
231
232 pure (userId, rootId, corpusId)
233
234
235 ------------------------------------------------------------------------
236
237
238 class UniqId a
239 where
240 uniqId :: Lens' a (Maybe HashId)
241
242
243 instance UniqId HyperdataDocument
244 where
245 uniqId = hyperdataDocument_uniqId
246
247 instance UniqId HyperdataContact
248 where
249 uniqId = hc_uniqId
250
251 viewUniqId' :: UniqId a => a -> (HashId, a)
252 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
253 where
254 err = panic "[ERROR] Database.Flow.toInsert"
255
256
257 toInserted :: [ReturnId] -> Map HashId ReturnId
258 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
259 . filter (\r -> reInserted r == True)
260
261 data DocumentWithId a = DocumentWithId
262 { documentId :: !NodeId
263 , documentData :: !a
264 } deriving (Show)
265
266 mergeData :: Map HashId ReturnId
267 -> Map HashId a
268 -> [DocumentWithId a]
269 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
270 where
271 toDocumentWithId (hash,hpd) =
272 DocumentWithId <$> fmap reId (lookup hash rs)
273 <*> Just hpd
274
275 ------------------------------------------------------------------------
276 data DocumentIdWithNgrams a = DocumentIdWithNgrams
277 { documentWithId :: !(DocumentWithId a)
278 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
279 } deriving (Show)
280
281 -- TODO extractNgrams according to Type of Data
282
283 class ExtractNgramsT h
284 where
285 extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
286
287
288 instance ExtractNgramsT HyperdataContact
289 where
290 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
291 where
292 extract :: TermType Lang -> HyperdataContact
293 -> Cmd err (Map Ngrams (Map NgramsType Int))
294 extract _l hc' = do
295 let authors = map text2ngrams
296 $ maybe ["Nothing"] (\a -> [a])
297 $ view (hc_who . _Just . cw_lastName) hc'
298
299 pure $ DM.fromList $ [(a', DM.singleton Authors 1) | a' <- authors ]
300
301
302
303
304 instance ExtractNgramsT HyperdataDocument
305 where
306 extractNgramsT = extractNgramsT'
307
308 extractNgramsT' :: TermType Lang -> HyperdataDocument
309 -> Cmd err (Map Ngrams (Map NgramsType Int))
310 extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
311 where
312 extractNgramsT'' :: TermType Lang -> HyperdataDocument
313 -> Cmd err (Map Ngrams (Map NgramsType Int))
314 extractNgramsT'' lang' doc = do
315 let source = text2ngrams
316 $ maybe "Nothing" identity
317 $ _hyperdataDocument_source doc
318
319 institutes = map text2ngrams
320 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
321 $ _hyperdataDocument_institutes doc
322
323 authors = map text2ngrams
324 $ maybe ["Nothing"] (splitOn ", ")
325 $ _hyperdataDocument_authors doc
326
327 leText = catMaybes [ _hyperdataDocument_title doc
328 , _hyperdataDocument_abstract doc
329 ]
330
331 terms' <- map text2ngrams
332 <$> map (intercalate " " . _terms_label)
333 <$> concat
334 <$> liftIO (extractTerms lang' leText)
335
336 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
337 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
338 <> [(a', DM.singleton Authors 1) | a' <- authors ]
339 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
340
341
342 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
343 -> Map Ngrams (Map NgramsType Int)
344 filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
345 where
346 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
347 True -> (ng,y)
348 False -> (Ngrams (Text.take s' t) n , y)
349
350
351 documentIdWithNgrams :: HasNodeError err
352 => (a
353 -> Cmd err (Map Ngrams (Map NgramsType Int)))
354 -> [DocumentWithId a]
355 -> Cmd err [DocumentIdWithNgrams a]
356 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
357 where
358 toDocumentIdWithNgrams d = do
359 e <- f $ documentData d
360 pure $ DocumentIdWithNgrams d e
361
362
363
364 -- FLOW LIST
365 -- | TODO check optimization
366 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
367 -> Map Ngrams (Map NgramsType (Map NodeId Int))
368 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
369 where
370 f :: DocumentIdWithNgrams a
371 -> Map Ngrams (Map NgramsType (Map NodeId Int))
372 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
373 where
374 nId = documentId $ documentWithId d
375
376 ------------------------------------------------------------------------
377 listInsert :: FlowCmdM env err m
378 => ListId -> Map NgramsType [NgramsElement]
379 -> m ()
380 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
381 -> putListNgrams lId typeList ngElmts
382 ) $ toList ngs
383
384 flowList :: FlowCmdM env err m => UserId -> CorpusId
385 -> Map NgramsType [NgramsElement]
386 -> m ListId
387 flowList uId cId ngs = do
388 lId <- getOrMkList cId uId
389 printDebug "listId flowList" lId
390 listInsert lId ngs
391 pure lId
392