]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[DB|WIP] fix Tree RootId
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29
30 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
31 ( FlowCmdM
32 , flowCorpusFile
33 , flowCorpus
34 , flowCorpusSearchInDatabase
35 , getOrMkRoot
36 , getOrMk_RootWithCorpus
37 , flowAnnuaire
38 )
39 where
40
41 import Control.Lens ((^.), view, _Just)
42 import Data.Either
43 import Data.List (concat)
44 import Data.Map (Map, lookup)
45 import Data.Maybe (Maybe(..), catMaybes)
46 import Data.Monoid
47 import Data.Text (Text, splitOn, intercalate)
48 import Data.Traversable (traverse)
49 import Data.Tuple.Extra (first, second)
50 import Debug.Trace (trace)
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Core.Flow.Types
53 import Gargantext.Core.Types (NodePoly(..), Terms(..))
54 import Gargantext.Core.Types.Individu (User(..))
55 import Gargantext.Core.Types.Main
56 import Gargantext.Database.Action.Flow.List
57 import Gargantext.Database.Action.Flow.Types
58 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, getUserId)
59 import Gargantext.Database.Action.Query.Node
60 import Gargantext.Database.Action.Query.User
61 import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..), ContactWho(..))
62 import Gargantext.Database.Action.Query.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
63 import Gargantext.Database.Action.Query.Tree.Root (getRoot)
64 import Gargantext.Database.Action.Query.Tree (mkRoot)
65 import Gargantext.Database.Action.Search (searchInDatabase)
66 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
67 import Gargantext.Database.Admin.Types.Errors (HasNodeError(..), NodeError(..), nodeError)
68 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
69 import Gargantext.Database.Admin.Utils (Cmd)
70 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
71 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
72 import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
73 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
74 import Gargantext.Ext.IMT (toSchoolName)
75 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
76 import Gargantext.Prelude
77 import Gargantext.Prelude.Utils hiding (sha)
78 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
79 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
80 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
81 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
82 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
83 import Prelude (String)
84 import System.FilePath (FilePath)
85 import qualified Data.List as List
86 import qualified Data.Map as Map
87 import qualified Data.Text as Text
88 import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
89 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
90 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
91
92 ------------------------------------------------------------------------
93
94 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
95 -- | APIs
96 -- TODO instances
97 getDataApi :: Lang
98 -> Maybe Limit
99 -> ApiQuery
100 -> IO [HyperdataDocument]
101 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
102 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
103
104
105 -- UNUSED
106 _flowCorpusApi :: ( FlowCmdM env err m)
107 => User -> Either CorpusName [CorpusId]
108 -> TermType Lang
109 -> Maybe Limit
110 -> ApiQuery
111 -> m CorpusId
112 _flowCorpusApi u n tt l q = do
113 docs <- liftBase $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
114 flowCorpus u n tt docs
115
116 ------------------------------------------------------------------------
117
118 flowAnnuaire :: FlowCmdM env err m
119 => User
120 -> Either CorpusName [CorpusId]
121 -> (TermType Lang)
122 -> FilePath
123 -> m AnnuaireId
124 flowAnnuaire u n l filePath = do
125 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
126 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
127
128 -- UNUSED
129 _flowCorpusDebat :: FlowCmdM env err m
130 => User -> Either CorpusName [CorpusId]
131 -> Limit -> FilePath
132 -> m CorpusId
133 _flowCorpusDebat u n l fp = do
134 docs <- liftBase ( splitEvery 500
135 <$> take l
136 <$> readFile' fp
137 :: IO [[GD.GrandDebatReference ]]
138 )
139 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
140
141 flowCorpusFile :: FlowCmdM env err m
142 => User -> Either CorpusName [CorpusId]
143 -> Limit -- Limit the number of docs (for dev purpose)
144 -> TermType Lang -> FileFormat -> FilePath
145 -> m CorpusId
146 flowCorpusFile u n l la ff fp = do
147 docs <- liftBase ( splitEvery 500
148 <$> take l
149 <$> parseFile ff fp
150 )
151 flowCorpus u n la (map (map toHyperdataDocument) docs)
152
153 -- TODO query with complex query
154 flowCorpusSearchInDatabase :: FlowCmdM env err m
155 => User
156 -> Lang
157 -> Text
158 -> m CorpusId
159 flowCorpusSearchInDatabase u la q = do
160 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
161 (UserName userMaster)
162 (Left "")
163 (Nothing :: Maybe HyperdataCorpus)
164 ids <- map fst <$> searchInDatabase cId (stemIt q)
165 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
166
167
168 -- UNUSED
169 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
170 => User
171 -> Lang
172 -> Text
173 -> m CorpusId
174 _flowCorpusSearchInDatabaseApi u la q = do
175 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
176 (UserName userMaster)
177 (Left "")
178 (Nothing :: Maybe HyperdataCorpus)
179 ids <- map fst <$> searchInDatabase cId (stemIt q)
180 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
181
182 ------------------------------------------------------------------------
183 -- | TODO improve the needed type to create/update a corpus
184 {- UNUSED
185 data UserInfo = Username Text
186 | UserId NodeId
187 data CorpusInfo = CorpusName Lang Text
188 | CorpusId Lang NodeId
189 -}
190
191 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
192 => Maybe c
193 -> User
194 -> Either CorpusName [CorpusId]
195 -> TermType Lang
196 -> [[a]]
197 -> m CorpusId
198 flow c u cn la docs = do
199 ids <- traverse (insertMasterDocs c la ) docs
200 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
201
202 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
203 => User
204 -> Either CorpusName [CorpusId]
205 -> TermType Lang
206 -> [[a]]
207 -> m CorpusId
208 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
209
210 ------------------------------------------------------------------------
211 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
212 => Lang
213 -> User
214 -> Either CorpusName [CorpusId]
215 -> Maybe c
216 -> [NodeId]
217 -> m CorpusId
218 flowCorpusUser l user corpusName ctype ids = do
219 -- User Flow
220 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
221 listId <- getOrMkList userCorpusId userId
222 _cooc <- mkNode NodeListCooc listId userId
223 -- TODO: check if present already, ignore
224 _ <- Doc.add userCorpusId ids
225
226 _tId <- mkNode NodeTexts userCorpusId userId
227 -- printDebug "Node Text Id" tId
228
229 -- User List Flow
230 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
231 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
232 _userListId <- flowList_DbRepo listId ngs
233 _mastListId <- getOrMkList masterCorpusId masterUserId
234 -- _ <- insertOccsUpdates userCorpusId mastListId
235 -- printDebug "userListId" userListId
236 -- User Graph Flow
237 _ <- mkDashboard userCorpusId userId
238 _ <- mkGraph userCorpusId userId
239 --_ <- mkPhylo userCorpusId userId
240
241 -- Annuaire Flow
242 -- _ <- mkAnnuaire rootUserId userId
243 pure userCorpusId
244
245
246 insertMasterDocs :: ( FlowCmdM env err m
247 , FlowCorpus a
248 , MkCorpus c
249 )
250 => Maybe c
251 -> TermType Lang
252 -> [a]
253 -> m [DocId]
254 insertMasterDocs c lang hs = do
255 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
256
257 -- TODO Type NodeDocumentUnicised
258 let docs = map addUniqId hs
259 ids <- insertDb masterUserId masterCorpusId docs
260 let
261 ids' = map reId ids
262 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
263 -- TODO
264 -- create a corpus with database name (CSV or PubMed)
265 -- add documents to the corpus (create node_node link)
266 -- this will enable global database monitoring
267
268 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
269 maps <- mapNodeIdNgrams
270 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
271
272 terms2id <- insertNgrams $ Map.keys maps
273 -- to be removed
274 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
275
276 -- new
277 lId <- getOrMkList masterCorpusId masterUserId
278 mapCgramsId <- listInsertDb lId toNodeNgramsW'
279 $ map (first _ngramsTerms . second Map.keys)
280 $ Map.toList maps
281 -- insertDocNgrams
282 _return <- insertNodeNodeNgrams2
283 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
284 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
285 <*> Just (fromIntegral w :: Double)
286 | (terms, mapNgramsTypes) <- Map.toList maps
287 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
288 , (nId, w) <- Map.toList mapNodeIdWeight
289 ]
290
291 _ <- Doc.add masterCorpusId ids'
292 _cooc <- mkNode NodeListCooc lId masterUserId
293 -- to be removed
294 _ <- insertDocNgrams lId indexedNgrams
295
296 pure ids'
297
298
299 withLang :: HasText a => TermType Lang
300 -> [DocumentWithId a]
301 -> TermType Lang
302 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
303 where
304 m' = case m of
305 Nothing -> trace ("buildTries here" :: String)
306 $ Just
307 $ buildTries n ( fmap toToken $ uniText
308 $ Text.intercalate " . "
309 $ List.concat
310 $ map hasText ns
311 )
312 just_m -> just_m
313 withLang l _ = l
314
315
316
317 type CorpusName = Text
318
319 getOrMkRoot :: (HasNodeError err)
320 => User
321 -> Cmd err (UserId, RootId)
322 getOrMkRoot user = do
323 userId <- getUserId user
324
325 rootId' <- map _node_id <$> getRoot user
326
327 rootId'' <- case rootId' of
328 [] -> mkRoot user
329 n -> case length n >= 2 of
330 True -> nodeError ManyNodeUsers
331 False -> pure rootId'
332
333 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
334 pure (userId, rootId)
335
336
337 getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
338 => User
339 -> Either CorpusName [CorpusId]
340 -> Maybe a
341 -> Cmd err (UserId, RootId, CorpusId)
342 getOrMk_RootWithCorpus user cName c = do
343 (userId, rootId) <- getOrMkRoot user
344 corpusId'' <- if user == UserName userMaster
345 then do
346 ns <- getCorporaWithParentId rootId
347 pure $ map _node_id ns
348 else
349 pure $ fromRight [] cName
350
351 corpusId' <- if corpusId'' /= []
352 then pure corpusId''
353 else do
354 c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
355 _tId <- case head c' of
356 Nothing -> pure [0]
357 Just c'' -> mkNode NodeTexts c'' userId
358 pure c'
359
360 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
361 pure (userId, rootId, corpusId)
362
363
364 ------------------------------------------------------------------------
365 viewUniqId' :: UniqId a
366 => a
367 -> (HashId, a)
368 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
369 where
370 err = panic "[ERROR] Database.Flow.toInsert"
371
372
373 toInserted :: [ReturnId]
374 -> Map HashId ReturnId
375 toInserted =
376 Map.fromList . map (\r -> (reUniqId r, r) )
377 . filter (\r -> reInserted r == True)
378
379 mergeData :: Map HashId ReturnId
380 -> Map HashId a
381 -> [DocumentWithId a]
382 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
383 where
384 toDocumentWithId (sha,hpd) =
385 DocumentWithId <$> fmap reId (lookup sha rs)
386 <*> Just hpd
387
388 ------------------------------------------------------------------------
389
390 instance HasText HyperdataContact
391 where
392 hasText = undefined
393
394 instance ExtractNgramsT HyperdataContact
395 where
396 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
397 where
398 extract :: TermType Lang -> HyperdataContact
399 -> Cmd err (Map Ngrams (Map NgramsType Int))
400 extract _l hc' = do
401 let authors = map text2ngrams
402 $ maybe ["Nothing"] (\a -> [a])
403 $ view (hc_who . _Just . cw_lastName) hc'
404
405 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
406
407 instance HasText HyperdataDocument
408 where
409 hasText h = catMaybes [ _hyperdataDocument_title h
410 , _hyperdataDocument_abstract h
411 ]
412
413 instance ExtractNgramsT HyperdataDocument
414 where
415 extractNgramsT :: TermType Lang
416 -> HyperdataDocument
417 -> Cmd err (Map Ngrams (Map NgramsType Int))
418 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
419 where
420 extractNgramsT' :: TermType Lang
421 -> HyperdataDocument
422 -> Cmd err (Map Ngrams (Map NgramsType Int))
423 extractNgramsT' lang' doc = do
424 let source = text2ngrams
425 $ maybe "Nothing" identity
426 $ _hyperdataDocument_source doc
427
428 institutes = map text2ngrams
429 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
430 $ _hyperdataDocument_institutes doc
431
432 authors = map text2ngrams
433 $ maybe ["Nothing"] (splitOn ", ")
434 $ _hyperdataDocument_authors doc
435
436 terms' <- map text2ngrams
437 <$> map (intercalate " " . _terms_label)
438 <$> concat
439 <$> liftBase (extractTerms lang' $ hasText doc)
440
441 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
442 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
443 <> [(a', Map.singleton Authors 1) | a' <- authors ]
444 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
445
446 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
447 -> Map Ngrams (Map NgramsType Int)
448 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
449 where
450 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
451 True -> (ng,y)
452 False -> (Ngrams (Text.take s' t) n , y)
453
454
455 documentIdWithNgrams :: HasNodeError err
456 => (a
457 -> Cmd err (Map Ngrams (Map NgramsType Int)))
458 -> [DocumentWithId a]
459 -> Cmd err [DocumentIdWithNgrams a]
460 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
461 where
462 toDocumentIdWithNgrams d = do
463 e <- f $ documentData d
464 pure $ DocumentIdWithNgrams d e
465