]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Merge branch 'dev'
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 ( FlowCmdM
32 , flowCorpusFile
33 , flowCorpus
34 , flowCorpusSearchInDatabase
35 , getOrMkRoot
36 , getOrMkRootWithCorpus
37 , flowAnnuaire
38 )
39 where
40 import Prelude (String)
41 import Data.Either
42 import Debug.Trace (trace)
43 import Control.Lens ((^.), view, _Just)
44 import Control.Monad (mapM_)
45 import Control.Monad.IO.Class (liftIO)
46 import Data.List (concat)
47 import Data.Map (Map, lookup, toList)
48 import Data.Maybe (Maybe(..), catMaybes)
49 import Data.Monoid
50 import Data.Text (Text, splitOn, intercalate)
51 import GHC.Show (Show)
52 import Gargantext.API.Ngrams (HasRepoVar)
53 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Core.Types (NodePoly(..), Terms(..))
56 import Gargantext.Core.Types.Individu (Username)
57 import Gargantext.Core.Flow
58 import Gargantext.Core.Types.Main
59 import Gargantext.Database.Config (userMaster, corpusMasterName)
60 import Gargantext.Database.Flow.Utils (insertDocNgrams)
61 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Root (getRoot)
64 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
65 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
66 import Gargantext.Database.Schema.User (getUser, UserLight(..))
67 import Gargantext.Database.TextSearch (searchInDatabase)
68 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
69 import Gargantext.Database.Utils (Cmd, CmdM)
70 import Gargantext.Ext.IMT (toSchoolName)
71 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
72 import Gargantext.Prelude
73 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
74 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
75 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
76 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
77 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
78 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
79 import Gargantext.Prelude.Utils hiding (sha)
80 import System.FilePath (FilePath)
81 import qualified Data.List as List
82 import qualified Data.Map as Map
83 import qualified Data.Text as Text
84 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
85 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
86
87 type FlowCmdM env err m =
88 ( CmdM env err m
89 , RepoCmdM env err m
90 , HasNodeError err
91 , HasRepoVar env
92 )
93
94 ------------------------------------------------------------------------
95
96 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
97 -- | APIs
98 -- TODO instances
99 getDataApi :: Lang
100 -> Maybe Limit
101 -> ApiQuery
102 -> IO [HyperdataDocument]
103 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
104 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
105
106
107 -- UNUSED
108 _flowCorpusApi :: ( FlowCmdM env err m)
109 => Username -> Either CorpusName [CorpusId]
110 -> TermType Lang
111 -> Maybe Limit
112 -> ApiQuery
113 -> m CorpusId
114 _flowCorpusApi u n tt l q = do
115 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
116 flowCorpus u n tt docs
117
118 ------------------------------------------------------------------------
119
120 flowAnnuaire :: FlowCmdM env err m
121 => Username
122 -> Either CorpusName [CorpusId]
123 -> (TermType Lang)
124 -> FilePath
125 -> m AnnuaireId
126 flowAnnuaire u n l filePath = do
127 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
128 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
129
130 -- UNUSED
131 _flowCorpusDebat :: FlowCmdM env err m
132 => Username -> Either CorpusName [CorpusId]
133 -> Limit -> FilePath
134 -> m CorpusId
135 _flowCorpusDebat u n l fp = do
136 docs <- liftIO ( splitEvery 500
137 <$> take l
138 <$> readFile' fp
139 :: IO [[GD.GrandDebatReference ]]
140 )
141 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
142
143 flowCorpusFile :: FlowCmdM env err m
144 => Username -> Either CorpusName [CorpusId]
145 -> Limit -- Limit the number of docs (for dev purpose)
146 -> TermType Lang -> FileFormat -> FilePath
147 -> m CorpusId
148 flowCorpusFile u n l la ff fp = do
149 docs <- liftIO ( splitEvery 500
150 <$> take l
151 <$> parseFile ff fp
152 )
153 flowCorpus u n la (map (map toHyperdataDocument) docs)
154
155 -- TODO query with complex query
156 flowCorpusSearchInDatabase :: FlowCmdM env err m
157 => Username
158 -> Lang
159 -> Text
160 -> m CorpusId
161 flowCorpusSearchInDatabase u la q = do
162 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
163 userMaster
164 (Left "")
165 (Nothing :: Maybe HyperdataCorpus)
166 ids <- map fst <$> searchInDatabase cId (stemIt q)
167 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
168
169
170 -- UNUSED
171 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
172 => Username
173 -> Lang
174 -> Text
175 -> m CorpusId
176 _flowCorpusSearchInDatabaseApi u la q = do
177 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
178 userMaster
179 (Left "")
180 (Nothing :: Maybe HyperdataCorpus)
181 ids <- map fst <$> searchInDatabase cId (stemIt q)
182 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
183
184 ------------------------------------------------------------------------
185 -- | TODO improve the needed type to create/update a corpus
186 {- UNUSED
187 data UserInfo = Username Text
188 | UserId NodeId
189 data CorpusInfo = CorpusName Lang Text
190 | CorpusId Lang NodeId
191 -}
192
193 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
194 => Maybe c
195 -> Username
196 -> Either CorpusName [CorpusId]
197 -> TermType Lang
198 -> [[a]]
199 -> m CorpusId
200 flow c u cn la docs = do
201 ids <- mapM (insertMasterDocs c la ) docs
202 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
203
204 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
205 => Username
206 -> Either CorpusName [CorpusId]
207 -> TermType Lang
208 -> [[a]]
209 -> m CorpusId
210 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
211
212 ------------------------------------------------------------------------
213 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
214 => Lang
215 -> Username
216 -> Either CorpusName [CorpusId]
217 -> Maybe c
218 -> [NodeId]
219 -> m CorpusId
220 flowCorpusUser l userName corpusName ctype ids = do
221 -- User Flow
222 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
223 listId <- getOrMkList userCorpusId userId
224 -- TODO: check if present already, ignore
225 _ <- Doc.add userCorpusId ids
226 tId <- mkNode NodeTexts userCorpusId userId
227
228 printDebug "Node Text Id" tId
229
230 -- User List Flow
231 --{-
232 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
233 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
234 userListId <- flowList listId ngs
235 --mastListId <- getOrMkList masterCorpusId masterUserId
236 -- _ <- insertOccsUpdates userCorpusId mastListId
237 printDebug "userListId" userListId
238 -- User Graph Flow
239 _ <- mkDashboard userCorpusId userId
240 _ <- mkGraph userCorpusId userId
241 --_ <- mkPhylo userCorpusId userId
242 --}
243
244
245 -- Annuaire Flow
246 -- _ <- mkAnnuaire rootUserId userId
247 pure userCorpusId
248
249
250 insertMasterDocs :: ( FlowCmdM env err m
251 , FlowCorpus a
252 , MkCorpus c
253 )
254 => Maybe c
255 -> TermType Lang
256 -> [a]
257 -> m [DocId]
258 insertMasterDocs c lang hs = do
259 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
260
261 -- TODO Type NodeDocumentUnicised
262 let hs' = map addUniqId hs
263 ids <- insertDb masterUserId masterCorpusId hs'
264 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
265
266 let
267 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
268 where
269 m' = case m of
270 Nothing -> trace ("buildTries here" :: String)
271 $ Just
272 $ buildTries n ( fmap toToken $ uniText
273 $ Text.intercalate " . "
274 $ List.concat
275 $ map hasText documentsWithId
276 )
277 just_m -> just_m
278 fixLang l = l
279
280 lang' = fixLang lang
281 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
282 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
283 terms2id <- insertNgrams $ Map.keys maps
284 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
285
286 lId <- getOrMkList masterCorpusId masterUserId
287 _ <- insertDocNgrams lId indexedNgrams
288
289 pure $ map reId ids
290
291
292 type CorpusName = Text
293
294
295 getOrMkRoot :: (HasNodeError err)
296 => Username
297 -> Cmd err (UserId, RootId)
298 getOrMkRoot username = do
299 maybeUserId <- getUser username
300 userId <- case maybeUserId of
301 Nothing -> nodeError NoUserFound
302 Just user -> pure $ userLight_id user
303
304 rootId' <- map _node_id <$> getRoot username
305
306 rootId'' <- case rootId' of
307 [] -> mkRoot username userId
308 n -> case length n >= 2 of
309 True -> nodeError ManyNodeUsers
310 False -> pure rootId'
311
312 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
313 pure (userId, rootId)
314
315
316 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
317 => Username
318 -> Either CorpusName [CorpusId]
319 -> Maybe a
320 -> Cmd err (UserId, RootId, CorpusId)
321 getOrMkRootWithCorpus username cName c = do
322 (userId, rootId) <- getOrMkRoot username
323 corpusId'' <- if username == userMaster
324 then do
325 ns <- getCorporaWithParentId rootId
326 pure $ map _node_id ns
327 else
328 pure $ fromRight [] cName
329
330 corpusId' <- if corpusId'' /= []
331 then pure corpusId''
332 else mk (Just $ fromLeft "Default" cName) c rootId userId
333
334 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
335
336 pure (userId, rootId, corpusId)
337
338
339 ------------------------------------------------------------------------
340 viewUniqId' :: UniqId a
341 => a
342 -> (HashId, a)
343 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
344 where
345 err = panic "[ERROR] Database.Flow.toInsert"
346
347
348 toInserted :: [ReturnId]
349 -> Map HashId ReturnId
350 toInserted =
351 Map.fromList . map (\r -> (reUniqId r, r) )
352 . filter (\r -> reInserted r == True)
353
354 data DocumentWithId a = DocumentWithId
355 { documentId :: !NodeId
356 , documentData :: !a
357 } deriving (Show)
358
359 instance HasText a => HasText (DocumentWithId a)
360 where
361 hasText (DocumentWithId _ a) = hasText a
362
363 mergeData :: Map HashId ReturnId
364 -> Map HashId a
365 -> [DocumentWithId a]
366 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
367 where
368 toDocumentWithId (sha,hpd) =
369 DocumentWithId <$> fmap reId (lookup sha rs)
370 <*> Just hpd
371
372 ------------------------------------------------------------------------
373 data DocumentIdWithNgrams a = DocumentIdWithNgrams
374 { documentWithId :: !(DocumentWithId a)
375 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
376 } deriving (Show)
377
378
379 instance HasText HyperdataContact
380 where
381 hasText = undefined
382
383 instance ExtractNgramsT HyperdataContact
384 where
385 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
386 where
387 extract :: TermType Lang -> HyperdataContact
388 -> Cmd err (Map Ngrams (Map NgramsType Int))
389 extract _l hc' = do
390 let authors = map text2ngrams
391 $ maybe ["Nothing"] (\a -> [a])
392 $ view (hc_who . _Just . cw_lastName) hc'
393
394 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
395
396 instance HasText HyperdataDocument
397 where
398 hasText h = catMaybes [ _hyperdataDocument_title h
399 , _hyperdataDocument_abstract h
400 ]
401
402 instance ExtractNgramsT HyperdataDocument
403 where
404 extractNgramsT :: TermType Lang
405 -> HyperdataDocument
406 -> Cmd err (Map Ngrams (Map NgramsType Int))
407 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
408 where
409 extractNgramsT' :: TermType Lang
410 -> HyperdataDocument
411 -> Cmd err (Map Ngrams (Map NgramsType Int))
412 extractNgramsT' lang' doc = do
413 let source = text2ngrams
414 $ maybe "Nothing" identity
415 $ _hyperdataDocument_source doc
416
417 institutes = map text2ngrams
418 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
419 $ _hyperdataDocument_institutes doc
420
421 authors = map text2ngrams
422 $ maybe ["Nothing"] (splitOn ", ")
423 $ _hyperdataDocument_authors doc
424
425 terms' <- map text2ngrams
426 <$> map (intercalate " " . _terms_label)
427 <$> concat
428 <$> liftIO (extractTerms lang' $ hasText doc)
429
430 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
431 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
432 <> [(a', Map.singleton Authors 1) | a' <- authors ]
433 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
434
435 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
436 -> Map Ngrams (Map NgramsType Int)
437 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
438 where
439 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
440 True -> (ng,y)
441 False -> (Ngrams (Text.take s' t) n , y)
442
443
444 documentIdWithNgrams :: HasNodeError err
445 => (a
446 -> Cmd err (Map Ngrams (Map NgramsType Int)))
447 -> [DocumentWithId a]
448 -> Cmd err [DocumentIdWithNgrams a]
449 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
450 where
451 toDocumentIdWithNgrams d = do
452 e <- f $ documentData d
453 pure $ DocumentIdWithNgrams d e
454
455
456 -- FLOW LIST
457 -- | TODO check optimization
458 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
459 -> Map Ngrams (Map NgramsType (Map NodeId Int))
460 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
461 where
462 f :: DocumentIdWithNgrams a
463 -> Map Ngrams (Map NgramsType (Map NodeId Int))
464 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
465 where
466 nId = documentId $ documentWithId d
467
468 ------------------------------------------------------------------------
469 listInsert :: FlowCmdM env err m
470 => ListId
471 -> Map NgramsType [NgramsElement]
472 -> m ()
473 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
474 -> putListNgrams lId typeList ngElmts
475 ) $ toList ngs
476
477 flowList :: FlowCmdM env err m
478 => ListId
479 -> Map NgramsType [NgramsElement]
480 -> m ListId
481 flowList lId ngs = do
482 printDebug "listId flowList" lId
483 listInsert lId ngs
484 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
485 pure lId
486