]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[NGRAMS TABLE] refactor tableNgramsPull and expose listNgramsChangedSince
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 ( FlowCmdM
32 , flowCorpusFile
33 , flowCorpus
34 , flowCorpusSearchInDatabase
35 , getOrMkRoot
36 , getOrMkRootWithCorpus
37 , flowAnnuaire
38 )
39 where
40 import Prelude (String)
41 import Data.Either
42 import Debug.Trace (trace)
43 import Control.Lens ((^.), view, _Just)
44 import Control.Monad (mapM_)
45 import Control.Monad.IO.Class (liftIO)
46 import Data.List (concat)
47 import Data.Map (Map, lookup, toList)
48 import Data.Maybe (Maybe(..), catMaybes)
49 import Data.Monoid
50 import Data.Text (Text, splitOn, intercalate)
51 import GHC.Show (Show)
52 import Gargantext.API.Ngrams (HasRepoVar)
53 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Core.Types (NodePoly(..), Terms(..))
56 import Gargantext.Core.Types.Individu (Username)
57 import Gargantext.Core.Flow
58 import Gargantext.Core.Types.Main
59 import Gargantext.Database.Config (userMaster, corpusMasterName)
60 import Gargantext.Database.Flow.Utils (insertDocNgrams)
61 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Root (getRoot)
64 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
65 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
66 import Gargantext.Database.Schema.User (getUser, UserLight(..))
67 import Gargantext.Database.TextSearch (searchInDatabase)
68 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
69 import Gargantext.Database.Utils (Cmd, CmdM)
70 import Gargantext.Ext.IMT (toSchoolName)
71 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
72 import Gargantext.Prelude
73 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
74 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
75 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
76 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
77 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
78 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
79 import Gargantext.Prelude.Utils hiding (sha)
80 import System.FilePath (FilePath)
81 import qualified Data.List as List
82 import qualified Data.Map as Map
83 import qualified Data.Text as Text
84 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
85 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
86
87 type FlowCmdM env err m =
88 ( CmdM env err m
89 , RepoCmdM env err m
90 , HasNodeError err
91 , HasRepoVar env
92 )
93
94 ------------------------------------------------------------------------
95
96 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
97 -- | APIs
98 -- TODO instances
99 getDataApi :: Lang
100 -> Maybe Limit
101 -> ApiQuery
102 -> IO [HyperdataDocument]
103 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
104 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
105
106
107 -- UNUSED
108 _flowCorpusApi :: ( FlowCmdM env err m)
109 => Username -> Either CorpusName [CorpusId]
110 -> TermType Lang
111 -> Maybe Limit
112 -> ApiQuery
113 -> m CorpusId
114 _flowCorpusApi u n tt l q = do
115 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
116 flowCorpus u n tt docs
117
118 ------------------------------------------------------------------------
119
120 flowAnnuaire :: FlowCmdM env err m
121 => Username
122 -> Either CorpusName [CorpusId]
123 -> (TermType Lang)
124 -> FilePath
125 -> m AnnuaireId
126 flowAnnuaire u n l filePath = do
127 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
128 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
129
130 -- UNUSED
131 _flowCorpusDebat :: FlowCmdM env err m
132 => Username -> Either CorpusName [CorpusId]
133 -> Limit -> FilePath
134 -> m CorpusId
135 _flowCorpusDebat u n l fp = do
136 docs <- liftIO ( splitEvery 500
137 <$> take l
138 <$> readFile' fp
139 :: IO [[GD.GrandDebatReference ]]
140 )
141 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
142
143 flowCorpusFile :: FlowCmdM env err m
144 => Username -> Either CorpusName [CorpusId]
145 -> Limit -- Limit the number of docs (for dev purpose)
146 -> TermType Lang -> FileFormat -> FilePath
147 -> m CorpusId
148 flowCorpusFile u n l la ff fp = do
149 docs <- liftIO ( splitEvery 500
150 <$> take l
151 <$> parseFile ff fp
152 )
153 flowCorpus u n la (map (map toHyperdataDocument) docs)
154
155 -- TODO query with complex query
156 flowCorpusSearchInDatabase :: FlowCmdM env err m
157 => Username
158 -> Lang
159 -> Text
160 -> m CorpusId
161 flowCorpusSearchInDatabase u la q = do
162 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
163 userMaster
164 (Left "")
165 (Nothing :: Maybe HyperdataCorpus)
166 ids <- map fst <$> searchInDatabase cId (stemIt q)
167 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
168
169
170 -- UNUSED
171 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
172 => Username
173 -> Lang
174 -> Text
175 -> m CorpusId
176 _flowCorpusSearchInDatabaseApi u la q = do
177 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
178 userMaster
179 (Left "")
180 (Nothing :: Maybe HyperdataCorpus)
181 ids <- map fst <$> searchInDatabase cId (stemIt q)
182 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
183
184 ------------------------------------------------------------------------
185 -- | TODO improve the needed type to create/update a corpus
186 {- UNUSED
187 data UserInfo = Username Text
188 | UserId NodeId
189 data CorpusInfo = CorpusName Lang Text
190 | CorpusId Lang NodeId
191 -}
192
193 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
194 => Maybe c
195 -> Username
196 -> Either CorpusName [CorpusId]
197 -> TermType Lang
198 -> [[a]]
199 -> m CorpusId
200 flow c u cn la docs = do
201 ids <- mapM (insertMasterDocs c la ) docs
202 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
203
204 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
205 => Username
206 -> Either CorpusName [CorpusId]
207 -> TermType Lang
208 -> [[a]]
209 -> m CorpusId
210 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
211
212 ------------------------------------------------------------------------
213 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
214 => Lang
215 -> Username
216 -> Either CorpusName [CorpusId]
217 -> Maybe c
218 -> [NodeId]
219 -> m CorpusId
220 flowCorpusUser l userName corpusName ctype ids = do
221 -- User Flow
222 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
223 -- TODO: check if present already, ignore
224 _ <- Doc.add userCorpusId ids
225 tId <- mkNode NodeTexts userCorpusId userId
226
227 printDebug "Node Text Id" tId
228
229 -- User List Flow
230 --{-
231 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
232 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
233 userListId <- flowList userId userCorpusId ngs
234 printDebug "userListId" userListId
235 -- User Graph Flow
236 _ <- mkDashboard userCorpusId userId
237 _ <- mkGraph userCorpusId userId
238 --_ <- mkPhylo userCorpusId userId
239 --}
240
241
242 -- Annuaire Flow
243 -- _ <- mkAnnuaire rootUserId userId
244 pure userCorpusId
245
246
247 insertMasterDocs :: ( FlowCmdM env err m
248 , FlowCorpus a
249 , MkCorpus c
250 )
251 => Maybe c
252 -> TermType Lang
253 -> [a]
254 -> m [DocId]
255 insertMasterDocs c lang hs = do
256 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
257
258 -- TODO Type NodeDocumentUnicised
259 let hs' = map addUniqId hs
260 ids <- insertDb masterUserId masterCorpusId hs'
261 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
262
263 let
264 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
265 where
266 m' = case m of
267 Nothing -> trace ("buildTries here" :: String)
268 $ Just
269 $ buildTries n ( fmap toToken $ uniText
270 $ Text.intercalate " . "
271 $ List.concat
272 $ map hasText documentsWithId
273 )
274 just_m -> just_m
275 fixLang l = l
276
277 lang' = fixLang lang
278 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
279 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
280 terms2id <- insertNgrams $ Map.keys maps
281 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
282
283 lId <- getOrMkList masterCorpusId masterUserId
284 _ <- insertDocNgrams lId indexedNgrams
285
286 pure $ map reId ids
287
288
289 type CorpusName = Text
290
291
292 getOrMkRoot :: (HasNodeError err)
293 => Username
294 -> Cmd err (UserId, RootId)
295 getOrMkRoot username = do
296 maybeUserId <- getUser username
297 userId <- case maybeUserId of
298 Nothing -> nodeError NoUserFound
299 Just user -> pure $ userLight_id user
300
301 rootId' <- map _node_id <$> getRoot username
302
303 rootId'' <- case rootId' of
304 [] -> mkRoot username userId
305 n -> case length n >= 2 of
306 True -> nodeError ManyNodeUsers
307 False -> pure rootId'
308
309 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
310 pure (userId, rootId)
311
312
313 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
314 => Username
315 -> Either CorpusName [CorpusId]
316 -> Maybe a
317 -> Cmd err (UserId, RootId, CorpusId)
318 getOrMkRootWithCorpus username cName c = do
319 (userId, rootId) <- getOrMkRoot username
320 corpusId'' <- if username == userMaster
321 then do
322 ns <- getCorporaWithParentId rootId
323 pure $ map _node_id ns
324 else
325 pure $ fromRight [] cName
326
327 corpusId' <- if corpusId'' /= []
328 then pure corpusId''
329 else mk (Just $ fromLeft "Default" cName) c rootId userId
330
331 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
332
333 pure (userId, rootId, corpusId)
334
335
336 ------------------------------------------------------------------------
337 viewUniqId' :: UniqId a
338 => a
339 -> (HashId, a)
340 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
341 where
342 err = panic "[ERROR] Database.Flow.toInsert"
343
344
345 toInserted :: [ReturnId]
346 -> Map HashId ReturnId
347 toInserted =
348 Map.fromList . map (\r -> (reUniqId r, r) )
349 . filter (\r -> reInserted r == True)
350
351 data DocumentWithId a = DocumentWithId
352 { documentId :: !NodeId
353 , documentData :: !a
354 } deriving (Show)
355
356 instance HasText a => HasText (DocumentWithId a)
357 where
358 hasText (DocumentWithId _ a) = hasText a
359
360 mergeData :: Map HashId ReturnId
361 -> Map HashId a
362 -> [DocumentWithId a]
363 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
364 where
365 toDocumentWithId (sha,hpd) =
366 DocumentWithId <$> fmap reId (lookup sha rs)
367 <*> Just hpd
368
369 ------------------------------------------------------------------------
370 data DocumentIdWithNgrams a = DocumentIdWithNgrams
371 { documentWithId :: !(DocumentWithId a)
372 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
373 } deriving (Show)
374
375
376 instance HasText HyperdataContact
377 where
378 hasText = undefined
379
380 instance ExtractNgramsT HyperdataContact
381 where
382 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
383 where
384 extract :: TermType Lang -> HyperdataContact
385 -> Cmd err (Map Ngrams (Map NgramsType Int))
386 extract _l hc' = do
387 let authors = map text2ngrams
388 $ maybe ["Nothing"] (\a -> [a])
389 $ view (hc_who . _Just . cw_lastName) hc'
390
391 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
392
393 instance HasText HyperdataDocument
394 where
395 hasText h = catMaybes [ _hyperdataDocument_title h
396 , _hyperdataDocument_abstract h
397 ]
398
399 instance ExtractNgramsT HyperdataDocument
400 where
401 extractNgramsT :: TermType Lang
402 -> HyperdataDocument
403 -> Cmd err (Map Ngrams (Map NgramsType Int))
404 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
405 where
406 extractNgramsT' :: TermType Lang
407 -> HyperdataDocument
408 -> Cmd err (Map Ngrams (Map NgramsType Int))
409 extractNgramsT' lang' doc = do
410 let source = text2ngrams
411 $ maybe "Nothing" identity
412 $ _hyperdataDocument_source doc
413
414 institutes = map text2ngrams
415 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
416 $ _hyperdataDocument_institutes doc
417
418 authors = map text2ngrams
419 $ maybe ["Nothing"] (splitOn ", ")
420 $ _hyperdataDocument_authors doc
421
422 terms' <- map text2ngrams
423 <$> map (intercalate " " . _terms_label)
424 <$> concat
425 <$> liftIO (extractTerms lang' $ hasText doc)
426
427 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
428 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
429 <> [(a', Map.singleton Authors 1) | a' <- authors ]
430 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
431
432 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
433 -> Map Ngrams (Map NgramsType Int)
434 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
435 where
436 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
437 True -> (ng,y)
438 False -> (Ngrams (Text.take s' t) n , y)
439
440
441 documentIdWithNgrams :: HasNodeError err
442 => (a
443 -> Cmd err (Map Ngrams (Map NgramsType Int)))
444 -> [DocumentWithId a]
445 -> Cmd err [DocumentIdWithNgrams a]
446 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
447 where
448 toDocumentIdWithNgrams d = do
449 e <- f $ documentData d
450 pure $ DocumentIdWithNgrams d e
451
452
453 -- FLOW LIST
454 -- | TODO check optimization
455 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
456 -> Map Ngrams (Map NgramsType (Map NodeId Int))
457 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
458 where
459 f :: DocumentIdWithNgrams a
460 -> Map Ngrams (Map NgramsType (Map NodeId Int))
461 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
462 where
463 nId = documentId $ documentWithId d
464
465 ------------------------------------------------------------------------
466 listInsert :: FlowCmdM env err m
467 => ListId
468 -> Map NgramsType [NgramsElement]
469 -> m ()
470 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
471 -> putListNgrams lId typeList ngElmts
472 ) $ toList ngs
473
474 flowList :: FlowCmdM env err m
475 => UserId
476 -> CorpusId
477 -> Map NgramsType [NgramsElement]
478 -> m ListId
479 flowList uId cId ngs = do
480 lId <- getOrMkList cId uId
481 printDebug "listId flowList" lId
482 listInsert lId ngs
483 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
484 pure lId
485