]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[DB] Master User Texts
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 ( FlowCmdM
32 , flowCorpusFile
33 , flowCorpus
34 , flowCorpusSearchInDatabase
35 , getOrMkRoot
36 , getOrMk_RootWithCorpus
37 , flowAnnuaire
38 )
39 where
40 import Prelude (String)
41 import Data.Either
42 import Data.Traversable (traverse)
43 import Debug.Trace (trace)
44 import Control.Lens ((^.), view, _Just)
45 import Control.Monad.IO.Class (liftIO)
46 import Data.List (concat)
47 import Data.Map (Map, lookup)
48 import Data.Maybe (Maybe(..), catMaybes)
49 import Data.Monoid
50 import Data.Text (Text, splitOn, intercalate)
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Core.Types (NodePoly(..), Terms(..))
53 import Gargantext.Core.Types.Individu (Username)
54 import Gargantext.Core.Flow.Types
55 import Gargantext.Core.Types.Main
56 import Gargantext.Database.Config (userMaster, corpusMasterName)
57 import Gargantext.Database.Flow.Utils (insertDocNgrams)
58 import Gargantext.Database.Flow.List
59 import Gargantext.Database.Flow.Types
60 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
61 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
62 import Gargantext.Database.Root (getRoot)
63
64 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
65 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
66
67 import Gargantext.Database.Schema.User (getUser, UserLight(..))
68 import Gargantext.Database.TextSearch (searchInDatabase)
69 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
70 import Gargantext.Database.Utils (Cmd)
71 import Gargantext.Ext.IMT (toSchoolName)
72 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
73 import Gargantext.Prelude
74 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
75 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
76 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
77 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
78 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
79 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
80 import Gargantext.Prelude.Utils hiding (sha)
81 import System.FilePath (FilePath)
82 import qualified Data.List as List
83 import qualified Data.Map as Map
84 import qualified Data.Text as Text
85 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
86 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
87
88 ------------------------------------------------------------------------
89
90 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
91 -- | APIs
92 -- TODO instances
93 getDataApi :: Lang
94 -> Maybe Limit
95 -> ApiQuery
96 -> IO [HyperdataDocument]
97 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
98 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
99
100
101 -- UNUSED
102 _flowCorpusApi :: ( FlowCmdM env err m)
103 => Username -> Either CorpusName [CorpusId]
104 -> TermType Lang
105 -> Maybe Limit
106 -> ApiQuery
107 -> m CorpusId
108 _flowCorpusApi u n tt l q = do
109 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
110 flowCorpus u n tt docs
111
112 ------------------------------------------------------------------------
113
114 flowAnnuaire :: FlowCmdM env err m
115 => Username
116 -> Either CorpusName [CorpusId]
117 -> (TermType Lang)
118 -> FilePath
119 -> m AnnuaireId
120 flowAnnuaire u n l filePath = do
121 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
122 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
123
124 -- UNUSED
125 _flowCorpusDebat :: FlowCmdM env err m
126 => Username -> Either CorpusName [CorpusId]
127 -> Limit -> FilePath
128 -> m CorpusId
129 _flowCorpusDebat u n l fp = do
130 docs <- liftIO ( splitEvery 500
131 <$> take l
132 <$> readFile' fp
133 :: IO [[GD.GrandDebatReference ]]
134 )
135 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
136
137 flowCorpusFile :: FlowCmdM env err m
138 => Username -> Either CorpusName [CorpusId]
139 -> Limit -- Limit the number of docs (for dev purpose)
140 -> TermType Lang -> FileFormat -> FilePath
141 -> m CorpusId
142 flowCorpusFile u n l la ff fp = do
143 docs <- liftIO ( splitEvery 500
144 <$> take l
145 <$> parseFile ff fp
146 )
147 flowCorpus u n la (map (map toHyperdataDocument) docs)
148
149 -- TODO query with complex query
150 flowCorpusSearchInDatabase :: FlowCmdM env err m
151 => Username
152 -> Lang
153 -> Text
154 -> m CorpusId
155 flowCorpusSearchInDatabase u la q = do
156 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
157 userMaster
158 (Left "")
159 (Nothing :: Maybe HyperdataCorpus)
160 ids <- map fst <$> searchInDatabase cId (stemIt q)
161 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
162
163
164 -- UNUSED
165 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
166 => Username
167 -> Lang
168 -> Text
169 -> m CorpusId
170 _flowCorpusSearchInDatabaseApi u la q = do
171 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
172 userMaster
173 (Left "")
174 (Nothing :: Maybe HyperdataCorpus)
175 ids <- map fst <$> searchInDatabase cId (stemIt q)
176 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
177
178 ------------------------------------------------------------------------
179 -- | TODO improve the needed type to create/update a corpus
180 {- UNUSED
181 data UserInfo = Username Text
182 | UserId NodeId
183 data CorpusInfo = CorpusName Lang Text
184 | CorpusId Lang NodeId
185 -}
186
187 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
188 => Maybe c
189 -> Username
190 -> Either CorpusName [CorpusId]
191 -> TermType Lang
192 -> [[a]]
193 -> m CorpusId
194 flow c u cn la docs = do
195 ids <- traverse (insertMasterDocs c la ) docs
196 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
197
198 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
199 => Username
200 -> Either CorpusName [CorpusId]
201 -> TermType Lang
202 -> [[a]]
203 -> m CorpusId
204 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
205
206 ------------------------------------------------------------------------
207 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
208 => Lang
209 -> Username
210 -> Either CorpusName [CorpusId]
211 -> Maybe c
212 -> [NodeId]
213 -> m CorpusId
214 flowCorpusUser l userName corpusName ctype ids = do
215 -- User Flow
216 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus userName corpusName ctype
217 listId <- getOrMkList userCorpusId userId
218 _cooc <- mkNode NodeListCooc listId userId
219 -- TODO: check if present already, ignore
220 _ <- Doc.add userCorpusId ids
221
222 _tId <- mkNode NodeTexts userCorpusId userId
223 -- printDebug "Node Text Id" tId
224
225 -- User List Flow
226 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left "") ctype
227 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
228 _userListId <- flowList_DbRepo listId ngs
229 --mastListId <- getOrMkList masterCorpusId masterUserId
230 -- _ <- insertOccsUpdates userCorpusId mastListId
231 -- printDebug "userListId" userListId
232 -- User Graph Flow
233 _ <- mkDashboard userCorpusId userId
234 _ <- mkGraph userCorpusId userId
235 --_ <- mkPhylo userCorpusId userId
236
237 -- Annuaire Flow
238 -- _ <- mkAnnuaire rootUserId userId
239 pure userCorpusId
240
241
242 insertMasterDocs :: ( FlowCmdM env err m
243 , FlowCorpus a
244 , MkCorpus c
245 )
246 => Maybe c
247 -> TermType Lang
248 -> [a]
249 -> m [DocId]
250 insertMasterDocs c lang hs = do
251 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left corpusMasterName) c
252
253 -- TODO Type NodeDocumentUnicised
254 let docs = map addUniqId hs
255 ids <- insertDb masterUserId masterCorpusId docs
256 let
257 ids' = map reId ids
258 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
259 _ <- Doc.add masterCorpusId ids'
260 -- TODO
261 -- create a corpus with database name (CSV or PubMed)
262 -- add documents to the corpus (create node_node link)
263 -- this will enable global database monitoring
264
265 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
266 maps <- mapNodeIdNgrams
267 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
268
269 lId <- getOrMkList masterCorpusId masterUserId
270 terms2id <- insertNgrams $ Map.keys maps
271 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
272
273 _cooc <- mkNode NodeListCooc lId masterUserId
274 _ <- insertDocNgrams lId indexedNgrams
275
276 pure ids'
277
278
279 withLang :: HasText a => TermType Lang
280 -> [DocumentWithId a]
281 -> TermType Lang
282 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
283 where
284 m' = case m of
285 Nothing -> trace ("buildTries here" :: String)
286 $ Just
287 $ buildTries n ( fmap toToken $ uniText
288 $ Text.intercalate " . "
289 $ List.concat
290 $ map hasText ns
291 )
292 just_m -> just_m
293 withLang l _ = l
294
295
296
297 type CorpusName = Text
298
299
300 getOrMkRoot :: (HasNodeError err)
301 => Username
302 -> Cmd err (UserId, RootId)
303 getOrMkRoot username = do
304 maybeUserId <- getUser username
305 userId <- case maybeUserId of
306 Nothing -> nodeError NoUserFound
307 Just user -> pure $ userLight_id user
308
309 rootId' <- map _node_id <$> getRoot username
310
311 rootId'' <- case rootId' of
312 [] -> mkRoot username userId
313 n -> case length n >= 2 of
314 True -> nodeError ManyNodeUsers
315 False -> pure rootId'
316
317 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
318 pure (userId, rootId)
319
320
321 getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
322 => Username
323 -> Either CorpusName [CorpusId]
324 -> Maybe a
325 -> Cmd err (UserId, RootId, CorpusId)
326 getOrMk_RootWithCorpus username cName c = do
327 (userId, rootId) <- getOrMkRoot username
328 corpusId'' <- if username == userMaster
329 then do
330 ns <- getCorporaWithParentId rootId
331 pure $ map _node_id ns
332 else
333 pure $ fromRight [] cName
334
335 corpusId' <- if corpusId'' /= []
336 then pure corpusId''
337 else do
338 c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
339 _tId <- case head c' of
340 Nothing -> pure [0]
341 Just c'' -> mkNode NodeTexts c'' userId
342 pure c'
343
344 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
345
346 pure (userId, rootId, corpusId)
347
348
349 ------------------------------------------------------------------------
350 viewUniqId' :: UniqId a
351 => a
352 -> (HashId, a)
353 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
354 where
355 err = panic "[ERROR] Database.Flow.toInsert"
356
357
358 toInserted :: [ReturnId]
359 -> Map HashId ReturnId
360 toInserted =
361 Map.fromList . map (\r -> (reUniqId r, r) )
362 . filter (\r -> reInserted r == True)
363
364 mergeData :: Map HashId ReturnId
365 -> Map HashId a
366 -> [DocumentWithId a]
367 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
368 where
369 toDocumentWithId (sha,hpd) =
370 DocumentWithId <$> fmap reId (lookup sha rs)
371 <*> Just hpd
372
373 ------------------------------------------------------------------------
374
375 instance HasText HyperdataContact
376 where
377 hasText = undefined
378
379 instance ExtractNgramsT HyperdataContact
380 where
381 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
382 where
383 extract :: TermType Lang -> HyperdataContact
384 -> Cmd err (Map Ngrams (Map NgramsType Int))
385 extract _l hc' = do
386 let authors = map text2ngrams
387 $ maybe ["Nothing"] (\a -> [a])
388 $ view (hc_who . _Just . cw_lastName) hc'
389
390 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
391
392 instance HasText HyperdataDocument
393 where
394 hasText h = catMaybes [ _hyperdataDocument_title h
395 , _hyperdataDocument_abstract h
396 ]
397
398 instance ExtractNgramsT HyperdataDocument
399 where
400 extractNgramsT :: TermType Lang
401 -> HyperdataDocument
402 -> Cmd err (Map Ngrams (Map NgramsType Int))
403 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
404 where
405 extractNgramsT' :: TermType Lang
406 -> HyperdataDocument
407 -> Cmd err (Map Ngrams (Map NgramsType Int))
408 extractNgramsT' lang' doc = do
409 let source = text2ngrams
410 $ maybe "Nothing" identity
411 $ _hyperdataDocument_source doc
412
413 institutes = map text2ngrams
414 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
415 $ _hyperdataDocument_institutes doc
416
417 authors = map text2ngrams
418 $ maybe ["Nothing"] (splitOn ", ")
419 $ _hyperdataDocument_authors doc
420
421 terms' <- map text2ngrams
422 <$> map (intercalate " " . _terms_label)
423 <$> concat
424 <$> liftIO (extractTerms lang' $ hasText doc)
425
426 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
427 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
428 <> [(a', Map.singleton Authors 1) | a' <- authors ]
429 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
430
431 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
432 -> Map Ngrams (Map NgramsType Int)
433 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
434 where
435 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
436 True -> (ng,y)
437 False -> (Ngrams (Text.take s' t) n , y)
438
439
440 documentIdWithNgrams :: HasNodeError err
441 => (a
442 -> Cmd err (Map Ngrams (Map NgramsType Int)))
443 -> [DocumentWithId a]
444 -> Cmd err [DocumentIdWithNgrams a]
445 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
446 where
447 toDocumentIdWithNgrams d = do
448 e <- f $ documentData d
449 pure $ DocumentIdWithNgrams d e
450