]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[WIP/DB] Refactoring (start).
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29
30 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
31 ( FlowCmdM
32 , flowCorpusFile
33 , flowCorpus
34 , flowCorpusSearchInDatabase
35 , getOrMkRoot
36 , getOrMk_RootWithCorpus
37 , flowAnnuaire
38 )
39 where
40
41 import Control.Lens ((^.), view, _Just)
42 import Data.Either
43 import Data.List (concat)
44 import Data.Map (Map, lookup)
45 import Data.Maybe (Maybe(..), catMaybes)
46 import Data.Monoid
47 import Data.Text (Text, splitOn, intercalate)
48 import Data.Traversable (traverse)
49 import Data.Tuple.Extra (first, second)
50 import Debug.Trace (trace)
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Core.Flow.Types
53 import Gargantext.Core.Types (NodePoly(..), Terms(..))
54 import Gargantext.Core.Types.Individu (User(..))
55 import Gargantext.Core.Types.Main
56 import Gargantext.Database.Config (userMaster, corpusMasterName)
57 import Gargantext.Database.Flow.List
58 import Gargantext.Database.Flow.Types
59 import Gargantext.Database.Flow.Utils (insertDocNgrams)
60 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
61 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
62 import Gargantext.Database.Root (getRoot)
63 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
64 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
65 import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
66 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
67 import Gargantext.Database.Schema.User (getUserId)
68 import Gargantext.Database.TextSearch (searchInDatabase)
69 import Gargantext.Database.Types.Errors (HasNodeError(..), NodeError(..), nodeError)
70 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
71 import Gargantext.Database.Utils (Cmd)
72 import Gargantext.Ext.IMT (toSchoolName)
73 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
74 import Gargantext.Prelude
75 import Gargantext.Prelude.Utils hiding (sha)
76 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
77 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
78 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
79 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
80 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
81 import Prelude (String)
82 import System.FilePath (FilePath)
83 import qualified Data.List as List
84 import qualified Data.Map as Map
85 import qualified Data.Text as Text
86 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
87 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
88 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
89
90 ------------------------------------------------------------------------
91
92 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
93 -- | APIs
94 -- TODO instances
95 getDataApi :: Lang
96 -> Maybe Limit
97 -> ApiQuery
98 -> IO [HyperdataDocument]
99 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
100 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
101
102
103 -- UNUSED
104 _flowCorpusApi :: ( FlowCmdM env err m)
105 => User -> Either CorpusName [CorpusId]
106 -> TermType Lang
107 -> Maybe Limit
108 -> ApiQuery
109 -> m CorpusId
110 _flowCorpusApi u n tt l q = do
111 docs <- liftBase $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
112 flowCorpus u n tt docs
113
114 ------------------------------------------------------------------------
115
116 flowAnnuaire :: FlowCmdM env err m
117 => User
118 -> Either CorpusName [CorpusId]
119 -> (TermType Lang)
120 -> FilePath
121 -> m AnnuaireId
122 flowAnnuaire u n l filePath = do
123 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
124 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
125
126 -- UNUSED
127 _flowCorpusDebat :: FlowCmdM env err m
128 => User -> Either CorpusName [CorpusId]
129 -> Limit -> FilePath
130 -> m CorpusId
131 _flowCorpusDebat u n l fp = do
132 docs <- liftBase ( splitEvery 500
133 <$> take l
134 <$> readFile' fp
135 :: IO [[GD.GrandDebatReference ]]
136 )
137 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
138
139 flowCorpusFile :: FlowCmdM env err m
140 => User -> Either CorpusName [CorpusId]
141 -> Limit -- Limit the number of docs (for dev purpose)
142 -> TermType Lang -> FileFormat -> FilePath
143 -> m CorpusId
144 flowCorpusFile u n l la ff fp = do
145 docs <- liftBase ( splitEvery 500
146 <$> take l
147 <$> parseFile ff fp
148 )
149 flowCorpus u n la (map (map toHyperdataDocument) docs)
150
151 -- TODO query with complex query
152 flowCorpusSearchInDatabase :: FlowCmdM env err m
153 => User
154 -> Lang
155 -> Text
156 -> m CorpusId
157 flowCorpusSearchInDatabase u la q = do
158 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
159 (UserName userMaster)
160 (Left "")
161 (Nothing :: Maybe HyperdataCorpus)
162 ids <- map fst <$> searchInDatabase cId (stemIt q)
163 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
164
165
166 -- UNUSED
167 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
168 => User
169 -> Lang
170 -> Text
171 -> m CorpusId
172 _flowCorpusSearchInDatabaseApi u la q = do
173 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
174 (UserName userMaster)
175 (Left "")
176 (Nothing :: Maybe HyperdataCorpus)
177 ids <- map fst <$> searchInDatabase cId (stemIt q)
178 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
179
180 ------------------------------------------------------------------------
181 -- | TODO improve the needed type to create/update a corpus
182 {- UNUSED
183 data UserInfo = Username Text
184 | UserId NodeId
185 data CorpusInfo = CorpusName Lang Text
186 | CorpusId Lang NodeId
187 -}
188
189 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
190 => Maybe c
191 -> User
192 -> Either CorpusName [CorpusId]
193 -> TermType Lang
194 -> [[a]]
195 -> m CorpusId
196 flow c u cn la docs = do
197 ids <- traverse (insertMasterDocs c la ) docs
198 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
199
200 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
201 => User
202 -> Either CorpusName [CorpusId]
203 -> TermType Lang
204 -> [[a]]
205 -> m CorpusId
206 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
207
208 ------------------------------------------------------------------------
209 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
210 => Lang
211 -> User
212 -> Either CorpusName [CorpusId]
213 -> Maybe c
214 -> [NodeId]
215 -> m CorpusId
216 flowCorpusUser l user corpusName ctype ids = do
217 -- User Flow
218 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
219 listId <- getOrMkList userCorpusId userId
220 _cooc <- mkNode NodeListCooc listId userId
221 -- TODO: check if present already, ignore
222 _ <- Doc.add userCorpusId ids
223
224 _tId <- mkNode NodeTexts userCorpusId userId
225 -- printDebug "Node Text Id" tId
226
227 -- User List Flow
228 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
229 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
230 _userListId <- flowList_DbRepo listId ngs
231 _mastListId <- getOrMkList masterCorpusId masterUserId
232 -- _ <- insertOccsUpdates userCorpusId mastListId
233 -- printDebug "userListId" userListId
234 -- User Graph Flow
235 _ <- mkDashboard userCorpusId userId
236 _ <- mkGraph userCorpusId userId
237 --_ <- mkPhylo userCorpusId userId
238
239 -- Annuaire Flow
240 -- _ <- mkAnnuaire rootUserId userId
241 pure userCorpusId
242
243
244 insertMasterDocs :: ( FlowCmdM env err m
245 , FlowCorpus a
246 , MkCorpus c
247 )
248 => Maybe c
249 -> TermType Lang
250 -> [a]
251 -> m [DocId]
252 insertMasterDocs c lang hs = do
253 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
254
255 -- TODO Type NodeDocumentUnicised
256 let docs = map addUniqId hs
257 ids <- insertDb masterUserId masterCorpusId docs
258 let
259 ids' = map reId ids
260 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
261 -- TODO
262 -- create a corpus with database name (CSV or PubMed)
263 -- add documents to the corpus (create node_node link)
264 -- this will enable global database monitoring
265
266 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
267 maps <- mapNodeIdNgrams
268 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
269
270 terms2id <- insertNgrams $ Map.keys maps
271 -- to be removed
272 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
273
274 -- new
275 lId <- getOrMkList masterCorpusId masterUserId
276 mapCgramsId <- listInsertDb lId toNodeNgramsW'
277 $ map (first _ngramsTerms . second Map.keys)
278 $ Map.toList maps
279 -- insertDocNgrams
280 _return <- insertNodeNodeNgrams2
281 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
282 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
283 <*> Just (fromIntegral w :: Double)
284 | (terms, mapNgramsTypes) <- Map.toList maps
285 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
286 , (nId, w) <- Map.toList mapNodeIdWeight
287 ]
288
289 _ <- Doc.add masterCorpusId ids'
290 _cooc <- mkNode NodeListCooc lId masterUserId
291 -- to be removed
292 _ <- insertDocNgrams lId indexedNgrams
293
294 pure ids'
295
296
297 withLang :: HasText a => TermType Lang
298 -> [DocumentWithId a]
299 -> TermType Lang
300 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
301 where
302 m' = case m of
303 Nothing -> trace ("buildTries here" :: String)
304 $ Just
305 $ buildTries n ( fmap toToken $ uniText
306 $ Text.intercalate " . "
307 $ List.concat
308 $ map hasText ns
309 )
310 just_m -> just_m
311 withLang l _ = l
312
313
314
315 type CorpusName = Text
316
317 getOrMkRoot :: (HasNodeError err)
318 => User
319 -> Cmd err (UserId, RootId)
320 getOrMkRoot user = do
321 userId <- getUserId user
322
323 rootId' <- map _node_id <$> getRoot user
324
325 rootId'' <- case rootId' of
326 [] -> mkRoot user
327 n -> case length n >= 2 of
328 True -> nodeError ManyNodeUsers
329 False -> pure rootId'
330
331 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
332 pure (userId, rootId)
333
334
335 getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
336 => User
337 -> Either CorpusName [CorpusId]
338 -> Maybe a
339 -> Cmd err (UserId, RootId, CorpusId)
340 getOrMk_RootWithCorpus user cName c = do
341 (userId, rootId) <- getOrMkRoot user
342 corpusId'' <- if user == UserName userMaster
343 then do
344 ns <- getCorporaWithParentId rootId
345 pure $ map _node_id ns
346 else
347 pure $ fromRight [] cName
348
349 corpusId' <- if corpusId'' /= []
350 then pure corpusId''
351 else do
352 c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
353 _tId <- case head c' of
354 Nothing -> pure [0]
355 Just c'' -> mkNode NodeTexts c'' userId
356 pure c'
357
358 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
359 pure (userId, rootId, corpusId)
360
361
362 ------------------------------------------------------------------------
363 viewUniqId' :: UniqId a
364 => a
365 -> (HashId, a)
366 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
367 where
368 err = panic "[ERROR] Database.Flow.toInsert"
369
370
371 toInserted :: [ReturnId]
372 -> Map HashId ReturnId
373 toInserted =
374 Map.fromList . map (\r -> (reUniqId r, r) )
375 . filter (\r -> reInserted r == True)
376
377 mergeData :: Map HashId ReturnId
378 -> Map HashId a
379 -> [DocumentWithId a]
380 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
381 where
382 toDocumentWithId (sha,hpd) =
383 DocumentWithId <$> fmap reId (lookup sha rs)
384 <*> Just hpd
385
386 ------------------------------------------------------------------------
387
388 instance HasText HyperdataContact
389 where
390 hasText = undefined
391
392 instance ExtractNgramsT HyperdataContact
393 where
394 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
395 where
396 extract :: TermType Lang -> HyperdataContact
397 -> Cmd err (Map Ngrams (Map NgramsType Int))
398 extract _l hc' = do
399 let authors = map text2ngrams
400 $ maybe ["Nothing"] (\a -> [a])
401 $ view (hc_who . _Just . cw_lastName) hc'
402
403 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
404
405 instance HasText HyperdataDocument
406 where
407 hasText h = catMaybes [ _hyperdataDocument_title h
408 , _hyperdataDocument_abstract h
409 ]
410
411 instance ExtractNgramsT HyperdataDocument
412 where
413 extractNgramsT :: TermType Lang
414 -> HyperdataDocument
415 -> Cmd err (Map Ngrams (Map NgramsType Int))
416 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
417 where
418 extractNgramsT' :: TermType Lang
419 -> HyperdataDocument
420 -> Cmd err (Map Ngrams (Map NgramsType Int))
421 extractNgramsT' lang' doc = do
422 let source = text2ngrams
423 $ maybe "Nothing" identity
424 $ _hyperdataDocument_source doc
425
426 institutes = map text2ngrams
427 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
428 $ _hyperdataDocument_institutes doc
429
430 authors = map text2ngrams
431 $ maybe ["Nothing"] (splitOn ", ")
432 $ _hyperdataDocument_authors doc
433
434 terms' <- map text2ngrams
435 <$> map (intercalate " " . _terms_label)
436 <$> concat
437 <$> liftBase (extractTerms lang' $ hasText doc)
438
439 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
440 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
441 <> [(a', Map.singleton Authors 1) | a' <- authors ]
442 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
443
444 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
445 -> Map Ngrams (Map NgramsType Int)
446 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
447 where
448 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
449 True -> (ng,y)
450 False -> (Ngrams (Text.take s' t) n , y)
451
452
453 documentIdWithNgrams :: HasNodeError err
454 => (a
455 -> Cmd err (Map Ngrams (Map NgramsType Int)))
456 -> [DocumentWithId a]
457 -> Cmd err [DocumentIdWithNgrams a]
458 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
459 where
460 toDocumentIdWithNgrams d = do
461 e <- f $ documentData d
462 pure $ DocumentIdWithNgrams d e
463