]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[Metrics] optim
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 ( FlowCmdM
32 , flowCorpusFile
33 , flowCorpus
34 , flowCorpusSearchInDatabase
35 , getOrMkRoot
36 , getOrMkRootWithCorpus
37 , flowAnnuaire
38 )
39 where
40 import Prelude (String)
41 import Data.Either
42 import Debug.Trace (trace)
43 import Control.Lens ((^.), view, _Just)
44 import Control.Monad (mapM_)
45 import Control.Monad.IO.Class (liftIO)
46 import Data.List (concat)
47 import Data.Map (Map, lookup, toList)
48 import Data.Maybe (Maybe(..), catMaybes)
49 import Data.Monoid
50 import Data.Text (Text, splitOn, intercalate)
51 import GHC.Show (Show)
52 import Gargantext.API.Ngrams (HasRepoVar)
53 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Core.Types (NodePoly(..), Terms(..))
56 import Gargantext.Core.Types.Individu (Username)
57 import Gargantext.Core.Flow
58 import Gargantext.Core.Types.Main
59 import Gargantext.Database.Config (userMaster, corpusMasterName)
60 import Gargantext.Database.Flow.Utils (insertDocNgrams)
61 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Root (getRoot)
64 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
65 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
66 import Gargantext.Database.Schema.User (getUser, UserLight(..))
67 import Gargantext.Database.TextSearch (searchInDatabase)
68 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
69 import Gargantext.Database.Utils (Cmd, CmdM)
70 import Gargantext.Database.Triggers
71 import Gargantext.Ext.IMT (toSchoolName)
72 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
73 import Gargantext.Prelude
74 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
75 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
76 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
77 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
78 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
79 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
80 import Gargantext.Prelude.Utils hiding (sha)
81 import System.FilePath (FilePath)
82 import qualified Data.List as List
83 import qualified Data.Map as Map
84 import qualified Data.Text as Text
85 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
86 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
87
88 type FlowCmdM env err m =
89 ( CmdM env err m
90 , RepoCmdM env err m
91 , HasNodeError err
92 , HasRepoVar env
93 )
94
95 ------------------------------------------------------------------------
96
97 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
98 -- | APIs
99 -- TODO instances
100 getDataApi :: Lang
101 -> Maybe Limit
102 -> ApiQuery
103 -> IO [HyperdataDocument]
104 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
105 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
106
107
108 -- UNUSED
109 _flowCorpusApi :: ( FlowCmdM env err m)
110 => Username -> Either CorpusName [CorpusId]
111 -> TermType Lang
112 -> Maybe Limit
113 -> ApiQuery
114 -> m CorpusId
115 _flowCorpusApi u n tt l q = do
116 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
117 flowCorpus u n tt docs
118
119 ------------------------------------------------------------------------
120
121 flowAnnuaire :: FlowCmdM env err m
122 => Username
123 -> Either CorpusName [CorpusId]
124 -> (TermType Lang)
125 -> FilePath
126 -> m AnnuaireId
127 flowAnnuaire u n l filePath = do
128 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
129 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
130
131 -- UNUSED
132 _flowCorpusDebat :: FlowCmdM env err m
133 => Username -> Either CorpusName [CorpusId]
134 -> Limit -> FilePath
135 -> m CorpusId
136 _flowCorpusDebat u n l fp = do
137 docs <- liftIO ( splitEvery 500
138 <$> take l
139 <$> readFile' fp
140 :: IO [[GD.GrandDebatReference ]]
141 )
142 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
143
144 flowCorpusFile :: FlowCmdM env err m
145 => Username -> Either CorpusName [CorpusId]
146 -> Limit -- Limit the number of docs (for dev purpose)
147 -> TermType Lang -> FileFormat -> FilePath
148 -> m CorpusId
149 flowCorpusFile u n l la ff fp = do
150 docs <- liftIO ( splitEvery 500
151 <$> take l
152 <$> parseFile ff fp
153 )
154 flowCorpus u n la (map (map toHyperdataDocument) docs)
155
156 -- TODO query with complex query
157 flowCorpusSearchInDatabase :: FlowCmdM env err m
158 => Username
159 -> Lang
160 -> Text
161 -> m CorpusId
162 flowCorpusSearchInDatabase u la q = do
163 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
164 userMaster
165 (Left "")
166 (Nothing :: Maybe HyperdataCorpus)
167 ids <- map fst <$> searchInDatabase cId (stemIt q)
168 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
169
170
171 -- UNUSED
172 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
173 => Username
174 -> Lang
175 -> Text
176 -> m CorpusId
177 _flowCorpusSearchInDatabaseApi u la q = do
178 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
179 userMaster
180 (Left "")
181 (Nothing :: Maybe HyperdataCorpus)
182 ids <- map fst <$> searchInDatabase cId (stemIt q)
183 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
184
185 ------------------------------------------------------------------------
186 -- | TODO improve the needed type to create/update a corpus
187 {- UNUSED
188 data UserInfo = Username Text
189 | UserId NodeId
190 data CorpusInfo = CorpusName Lang Text
191 | CorpusId Lang NodeId
192 -}
193
194 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
195 => Maybe c
196 -> Username
197 -> Either CorpusName [CorpusId]
198 -> TermType Lang
199 -> [[a]]
200 -> m CorpusId
201 flow c u cn la docs = do
202 ids <- mapM (insertMasterDocs c la ) docs
203 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
204
205 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
206 => Username
207 -> Either CorpusName [CorpusId]
208 -> TermType Lang
209 -> [[a]]
210 -> m CorpusId
211 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
212
213 ------------------------------------------------------------------------
214 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
215 => Lang
216 -> Username
217 -> Either CorpusName [CorpusId]
218 -> Maybe c
219 -> [NodeId]
220 -> m CorpusId
221 flowCorpusUser l userName corpusName ctype ids = do
222 -- User Flow
223 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
224 -- TODO: check if present already, ignore
225 _ <- Doc.add userCorpusId ids
226 tId <- mkNode NodeTexts userCorpusId userId
227
228 printDebug "Node Text Id" tId
229
230 -- User List Flow
231 --{-
232 (masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
233 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
234 userListId <- flowList userId userCorpusId ngs
235 mastListId <- getOrMkList masterCorpusId masterUserId
236 _ <- insertOccsUpdates userCorpusId mastListId
237 printDebug "userListId" userListId
238 -- User Graph Flow
239 _ <- mkDashboard userCorpusId userId
240 _ <- mkGraph userCorpusId userId
241 --_ <- mkPhylo userCorpusId userId
242 --}
243
244
245 -- Annuaire Flow
246 -- _ <- mkAnnuaire rootUserId userId
247 pure userCorpusId
248
249
250 insertMasterDocs :: ( FlowCmdM env err m
251 , FlowCorpus a
252 , MkCorpus c
253 )
254 => Maybe c
255 -> TermType Lang
256 -> [a]
257 -> m [DocId]
258 insertMasterDocs c lang hs = do
259 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
260
261 -- TODO Type NodeDocumentUnicised
262 let hs' = map addUniqId hs
263 ids <- insertDb masterUserId masterCorpusId hs'
264 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
265
266 let
267 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
268 where
269 m' = case m of
270 Nothing -> trace ("buildTries here" :: String)
271 $ Just
272 $ buildTries n ( fmap toToken $ uniText
273 $ Text.intercalate " . "
274 $ List.concat
275 $ map hasText documentsWithId
276 )
277 just_m -> just_m
278 fixLang l = l
279
280 lang' = fixLang lang
281 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
282 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
283 terms2id <- insertNgrams $ Map.keys maps
284 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
285
286 lId <- getOrMkList masterCorpusId masterUserId
287 _ <- insertDocNgrams lId indexedNgrams
288
289 pure $ map reId ids
290
291
292 type CorpusName = Text
293
294
295 getOrMkRoot :: (HasNodeError err)
296 => Username
297 -> Cmd err (UserId, RootId)
298 getOrMkRoot username = do
299 maybeUserId <- getUser username
300 userId <- case maybeUserId of
301 Nothing -> nodeError NoUserFound
302 Just user -> pure $ userLight_id user
303
304 rootId' <- map _node_id <$> getRoot username
305
306 rootId'' <- case rootId' of
307 [] -> mkRoot username userId
308 n -> case length n >= 2 of
309 True -> nodeError ManyNodeUsers
310 False -> pure rootId'
311
312 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
313 pure (userId, rootId)
314
315
316 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
317 => Username
318 -> Either CorpusName [CorpusId]
319 -> Maybe a
320 -> Cmd err (UserId, RootId, CorpusId)
321 getOrMkRootWithCorpus username cName c = do
322 (userId, rootId) <- getOrMkRoot username
323 corpusId'' <- if username == userMaster
324 then do
325 ns <- getCorporaWithParentId rootId
326 pure $ map _node_id ns
327 else
328 pure $ fromRight [] cName
329
330 corpusId' <- if corpusId'' /= []
331 then pure corpusId''
332 else mk (Just $ fromLeft "Default" cName) c rootId userId
333
334 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
335
336 pure (userId, rootId, corpusId)
337
338
339 ------------------------------------------------------------------------
340 viewUniqId' :: UniqId a
341 => a
342 -> (HashId, a)
343 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
344 where
345 err = panic "[ERROR] Database.Flow.toInsert"
346
347
348 toInserted :: [ReturnId]
349 -> Map HashId ReturnId
350 toInserted =
351 Map.fromList . map (\r -> (reUniqId r, r) )
352 . filter (\r -> reInserted r == True)
353
354 data DocumentWithId a = DocumentWithId
355 { documentId :: !NodeId
356 , documentData :: !a
357 } deriving (Show)
358
359 instance HasText a => HasText (DocumentWithId a)
360 where
361 hasText (DocumentWithId _ a) = hasText a
362
363 mergeData :: Map HashId ReturnId
364 -> Map HashId a
365 -> [DocumentWithId a]
366 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
367 where
368 toDocumentWithId (sha,hpd) =
369 DocumentWithId <$> fmap reId (lookup sha rs)
370 <*> Just hpd
371
372 ------------------------------------------------------------------------
373 data DocumentIdWithNgrams a = DocumentIdWithNgrams
374 { documentWithId :: !(DocumentWithId a)
375 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
376 } deriving (Show)
377
378
379 instance HasText HyperdataContact
380 where
381 hasText = undefined
382
383 instance ExtractNgramsT HyperdataContact
384 where
385 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
386 where
387 extract :: TermType Lang -> HyperdataContact
388 -> Cmd err (Map Ngrams (Map NgramsType Int))
389 extract _l hc' = do
390 let authors = map text2ngrams
391 $ maybe ["Nothing"] (\a -> [a])
392 $ view (hc_who . _Just . cw_lastName) hc'
393
394 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
395
396 instance HasText HyperdataDocument
397 where
398 hasText h = catMaybes [ _hyperdataDocument_title h
399 , _hyperdataDocument_abstract h
400 ]
401
402 instance ExtractNgramsT HyperdataDocument
403 where
404 extractNgramsT :: TermType Lang
405 -> HyperdataDocument
406 -> Cmd err (Map Ngrams (Map NgramsType Int))
407 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
408 where
409 extractNgramsT' :: TermType Lang
410 -> HyperdataDocument
411 -> Cmd err (Map Ngrams (Map NgramsType Int))
412 extractNgramsT' lang' doc = do
413 let source = text2ngrams
414 $ maybe "Nothing" identity
415 $ _hyperdataDocument_source doc
416
417 institutes = map text2ngrams
418 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
419 $ _hyperdataDocument_institutes doc
420
421 authors = map text2ngrams
422 $ maybe ["Nothing"] (splitOn ", ")
423 $ _hyperdataDocument_authors doc
424
425 terms' <- map text2ngrams
426 <$> map (intercalate " " . _terms_label)
427 <$> concat
428 <$> liftIO (extractTerms lang' $ hasText doc)
429
430 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
431 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
432 <> [(a', Map.singleton Authors 1) | a' <- authors ]
433 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
434
435 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
436 -> Map Ngrams (Map NgramsType Int)
437 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
438 where
439 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
440 True -> (ng,y)
441 False -> (Ngrams (Text.take s' t) n , y)
442
443
444 documentIdWithNgrams :: HasNodeError err
445 => (a
446 -> Cmd err (Map Ngrams (Map NgramsType Int)))
447 -> [DocumentWithId a]
448 -> Cmd err [DocumentIdWithNgrams a]
449 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
450 where
451 toDocumentIdWithNgrams d = do
452 e <- f $ documentData d
453 pure $ DocumentIdWithNgrams d e
454
455
456 -- FLOW LIST
457 -- | TODO check optimization
458 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
459 -> Map Ngrams (Map NgramsType (Map NodeId Int))
460 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
461 where
462 f :: DocumentIdWithNgrams a
463 -> Map Ngrams (Map NgramsType (Map NodeId Int))
464 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
465 where
466 nId = documentId $ documentWithId d
467
468 ------------------------------------------------------------------------
469 listInsert :: FlowCmdM env err m
470 => ListId
471 -> Map NgramsType [NgramsElement]
472 -> m ()
473 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
474 -> putListNgrams lId typeList ngElmts
475 ) $ toList ngs
476
477 flowList :: FlowCmdM env err m
478 => UserId
479 -> CorpusId
480 -> Map NgramsType [NgramsElement]
481 -> m ListId
482 flowList uId cId ngs = do
483 lId <- getOrMkList cId uId
484 printDebug "listId flowList" lId
485 listInsert lId ngs
486 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
487 pure lId
488