{-|
Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE RankNTypes              #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE DeriveGeneric           #-}
{-# LANGUAGE FlexibleContexts        #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE NoImplicitPrelude       #-}
{-# LANGUAGE OverloadedStrings       #-}
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
  ( FlowCmdM
  , flowCorpusFile
  , flowCorpus
  , flowCorpusSearchInDatabase
  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , flowAnnuaire
  )
  where
import Prelude (String)
import Data.Either
import Data.Tuple.Extra (first, second)
import Data.Traversable (traverse)
import Debug.Trace (trace)
import Control.Lens ((^.), view, _Just)
import Control.Monad.IO.Class (liftIO)
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Monoid
import Data.Text (Text, splitOn, intercalate)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (NodePoly(..), Terms(..))
import Gargantext.Core.Types.Individu (Username)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Types.Main
import Gargantext.Database.Config (userMaster, corpusMasterName)
import Gargantext.Database.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Flow.List
import Gargantext.Database.Flow.Types
import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Root (getRoot)

import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
import Gargantext.Database.Schema.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
import Gargantext.Database.Schema.User (getUser, UserLight(..))
import Gargantext.Database.TextSearch (searchInDatabase)
import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Utils (Cmd)
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude
import Gargantext.Text.Terms.Eleve (buildTries, toToken)
--import Gargantext.Text.List (buildNgramsLists,StopSize(..))
import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Prelude.Utils hiding (sha)
import System.FilePath (FilePath)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add as Doc (add)
import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
------------------------------------------------------------------------

data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
-- | APIs
-- TODO instances
getDataApi :: Lang
           -> Maybe Limit
           -> ApiQuery
           -> IO [HyperdataDocument]
getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
getDataApi lang limit (ApiIsidoreAuth  q) = Isidore.get lang limit Nothing  (Just q)
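-- Usage sketch for getDataApi (the query text and limit are purely
-- illustrative):
--
-- > docs <- getDataApi EN (Just 100) (ApiIsidoreQuery "climate change")
-- > -- docs :: [HyperdataDocument], ready to be chunked and passed to flowCorpus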
-- UNUSED
_flowCorpusApi :: (FlowCmdM env err m)
               => Username -> Either CorpusName [CorpusId]
               -> TermType Lang
               -> Maybe Limit
               -> ApiQuery
               -> m CorpusId
_flowCorpusApi u n tt l q = do
  docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
  flowCorpus u n tt docs
------------------------------------------------------------------------

flowAnnuaire :: FlowCmdM env err m
             => Username
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> m AnnuaireId
flowAnnuaire u n l filePath = do
  docs <- liftIO ((splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
-- UNUSED
_flowCorpusDebat :: FlowCmdM env err m
                 => Username -> Either CorpusName [CorpusId]
                 -> Limit -> FilePath
                 -> m CorpusId
_flowCorpusDebat u n l fp = do
  docs <- liftIO ( splitEvery 500
                 <$> take l
                 <$> readFile' fp
                 :: IO [[GD.GrandDebatReference]]
                 )
  flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
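-- Usage sketch for flowCorpusFile (all values are illustrative; WOS is one of
-- the FileFormat parsers):
--
-- > cId <- flowCorpusFile "user1" (Left "MyCorpus") 10000 (Multi EN) WOS "/tmp/docs.txt"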
flowCorpusFile :: FlowCmdM env err m
               => Username -> Either CorpusName [CorpusId]
               -> Limit -- ^ Limit the number of docs (for dev purposes)
               -> TermType Lang -> FileFormat -> FilePath
               -> m CorpusId
flowCorpusFile u n l la ff fp = do
  docs <- liftIO ( splitEvery 500
                 <$> take l
                 <$> parseFile ff fp
                 )
  flowCorpus u n la (map (map toHyperdataDocument) docs)
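-- Usage sketch: build a user corpus from documents already indexed in the
-- master corpus that match a (stemmed) query. Values are illustrative.
--
-- > cId <- flowCorpusSearchInDatabase "user1" EN "organic chemistry"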
-- TODO query with complex query
flowCorpusSearchInDatabase :: FlowCmdM env err m
                           => Username
                           -> Lang
                           -> Text
                           -> m CorpusId
flowCorpusSearchInDatabase u la q = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           userMaster
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchInDatabase cId (stemIt q)
  flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
-- UNUSED
_flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
                               => Username
                               -> Lang
                               -> Text
                               -> m CorpusId
_flowCorpusSearchInDatabaseApi u la q = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           userMaster
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchInDatabase cId (stemIt q)
  flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
{- UNUSED
data UserInfo = Username Text
              | UserId NodeId
data CorpusInfo = CorpusName Lang Text
                | CorpusId Lang NodeId
-}
flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     => Maybe c
     -> Username
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs = do
  ids <- traverse (insertMasterDocs c la) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
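-- Usage sketch: the [[a]] argument of flow/flowCorpus is a list of batches;
-- the helpers above build it with splitEvery 500. Values are illustrative.
--
-- > cId <- flowCorpus "user1" (Left "MyCorpus") (Multi EN)
-- >                   (splitEvery 500 (docs :: [HyperdataDocument]))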
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => Username
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
------------------------------------------------------------------------
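-- Rough map of the nodes created (or reused) by flowCorpusUser, as read from
-- the calls below (parent/child placement is only indicative):
--
--   user root
--     user corpus
--       - list (getOrMkList)
--       - NodeListCooc
--       - NodeTexts
--       - dashboard (mkDashboard)
--       - graph (mkGraph)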
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               => Lang
               -> Username
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser _l userName corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus userName corpusName ctype
  listId <- getOrMkList userCorpusId userId
  _cooc  <- mkNode NodeListCooc listId userId
  -- TODO: check if the documents are already present; if so, ignore them
  _ <- Doc.add userCorpusId ids

  _tId <- mkNode NodeTexts userCorpusId userId
  -- printDebug "Node Text Id" tId

  -- User List Flow
  (_masterUserId, _masterRootId, _masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left "") ctype
  --ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  --_userListId <- flowList_DbRepo listId ngs
  --mastListId  <- getOrMkList masterCorpusId masterUserId
  --_           <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId

  -- User Graph Flow
  _ <- mkDashboard userCorpusId userId
  _ <- mkGraph     userCorpusId userId
  --_ <- mkPhylo   userCorpusId userId

  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId

  pure userCorpusId
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus   c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left corpusMasterName) c

  -- TODO Type NodeDocumentUnicised
  let docs = map addUniqId hs
  ids <- insertDb masterUserId masterCorpusId docs
  let
    ids'            = map reId ids
    documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with the database name (CSV or PubMed)
  -- add the documents to that corpus (create the node_node link)
  -- this will enable global database monitoring

  -- maps :: IO (Map Ngrams (Map NgramsType (Map NodeId Int)))
  maps <- mapNodeIdNgrams
       <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  terms2id <- insertNgrams $ Map.keys maps
  -- to be removed
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps

  -- new
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ Map.toList maps
  -- insertDocNgrams
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
                                         <*> Just (fromIntegral w :: Double)
                       | (terms, mapNgramsTypes)        <- Map.toList maps
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]

  _cooc <- mkNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams

  pure ids'
withLang :: HasText a
         => TermType Lang
         -> [DocumentWithId a]
         -> TermType Lang
withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
  where
    m' = case m of
      Nothing -> trace ("buildTries here" :: String)
               $ Just
               $ buildTries n ( fmap toToken
                              $ uniText
                              $ Text.intercalate " . "
                              $ List.concat
                              $ map hasText ns
                              )
      just_m -> just_m
withLang l _ = l
type CorpusName = Text
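-- Usage sketch for getOrMkRoot ("user1" is purely illustrative):
--
-- > (userId, rootId) <- getOrMkRoot "user1"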
getOrMkRoot :: (HasNodeError err)
            => Username
            -> Cmd err (UserId, RootId)
getOrMkRoot username = do
  maybeUserId <- getUser username
  userId <- case maybeUserId of
    Nothing   -> nodeError NoUserFound
    Just user -> pure $ userLight_id user

  rootId' <- map _node_id <$> getRoot username

  rootId'' <- case rootId' of
    [] -> mkRoot username userId
    n  -> case length n >= 2 of
            True  -> nodeError ManyNodeUsers
            False -> pure rootId'

  rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
  pure (userId, rootId)
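-- Usage sketch for getOrMk_RootWithCorpus (the username and corpus name are
-- illustrative; the same call pattern is used by flowCorpusSearchInDatabase
-- above):
--
-- > (userId, rootId, corpusId) <- getOrMk_RootWithCorpus "user1"
-- >                                 (Left "MyCorpus")
-- >                                 (Nothing :: Maybe HyperdataCorpus)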
getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
                       => Username
                       -> Either CorpusName [CorpusId]
                       -> Maybe a
                       -> Cmd err (UserId, RootId, CorpusId)
getOrMk_RootWithCorpus username cName c = do
  (userId, rootId) <- getOrMkRoot username
  corpusId'' <- if username == userMaster
                  then do
                    ns <- getCorporaWithParentId rootId
                    pure $ map _node_id ns
                  else
                    pure $ fromRight [] cName

  corpusId' <- if corpusId'' /= []
                 then pure corpusId''
                 else do
                   c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
                   _tId <- case head c' of
                     Nothing  -> pure [0]
                     Just c'' -> mkNode NodeTexts c'' userId
                   pure c'

  corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')

  pure (userId, rootId, corpusId)
------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (HashId, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.viewUniqId'"
toInserted :: [ReturnId]
           -> Map HashId ReturnId
toInserted =
  Map.fromList . map    (\r -> (reUniqId r, r))
               . filter reInserted
mergeData :: Map HashId ReturnId
          -> Map HashId a
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      DocumentWithId <$> fmap reId (lookup sha rs)
                     <*> Just hpd
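-- How the three helpers above fit together in insertMasterDocs (sketch,
-- mirroring the code there):
--
-- > let docs = map addUniqId hs
-- > ids <- insertDb masterUserId masterCorpusId docs
-- > let documentsWithId = mergeData (toInserted ids)
-- >                                 (Map.fromList $ map viewUniqId' docs)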
------------------------------------------------------------------------

instance HasText HyperdataContact
  where
    hasText = undefined
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors]
instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hyperdataDocument_title    h
                          , _hyperdataDocument_abstract h
                          ]
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hyperdataDocument_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . splitOn ", ")
                         $ _hyperdataDocument_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hyperdataDocument_authors doc

          terms' <- map text2ngrams
                 <$> map (intercalate " " . _terms_label)
                 <$> concat
                 <$> liftIO (extractTerms lang' $ hasText doc)

          pure $ Map.fromList $  [(source, Map.singleton Sources     1)]
                              <> [(i', Map.singleton Institutes  1) | i' <- institutes]
                              <> [(a', Map.singleton Authors     1) | a' <- authors   ]
                              <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'    ]
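-- Sketch of the truncation behaviour of filterNgramsT: terms no longer than
-- the size bound are kept as-is, longer ones keep only their first s
-- characters (the ngrams size n is preserved).
--
-- > filterNgramsT 255 m   -- every Ngrams key of m now has at most 255 characters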
filterNgramsT :: Int
              -> Map Ngrams (Map NgramsType Int)
              -> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map (filter' s) $ Map.toList ms
  where
    filter' s' (ng@(Ngrams t n), y) = case Text.length t < s' of
      True  -> (ng, y)
      False -> (Ngrams (Text.take s' t) n, y)
documentIdWithNgrams :: HasNodeError err
                     => (a -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e
469