]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Generalize error type to make less use of ServantErr
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE FlexibleContexts #-}
24 {-# LANGUAGE InstanceSigs #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27 {-# LANGUAGE RankNTypes #-}
28 {-# LANGUAGE ConstrainedClassMethods #-}
29
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 where
32 import Prelude (String)
33 import Debug.Trace (trace)
34 import Control.Lens ((^.), view, Lens', _Just)
35 import Control.Monad (mapM_)
36 import Control.Monad.IO.Class (liftIO)
37 import Data.List (concat)
38 import Data.Map (Map, lookup, toList)
39 import Data.Maybe (Maybe(..), catMaybes)
40 import Data.Monoid
41 import Data.Text (Text, splitOn, intercalate)
42 import GHC.Show (Show)
43 import Gargantext.API.Ngrams (HasRepoVar)
44 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
45 import Gargantext.Core (Lang(..))
46 import Gargantext.Core.Types (NodePoly(..), Terms(..))
47 import Gargantext.Core.Types.Individu (Username)
48 import Gargantext.Core.Types.Main
49 import Gargantext.Database.Config (userMaster, corpusMasterName)
50 import Gargantext.Database.Flow.Utils (insertDocNgrams)
51 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
52 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
53 import Gargantext.Database.Root (getRoot)
54 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
55 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
56 import Gargantext.Database.Schema.User (getUser, UserLight(..))
57 import Gargantext.Database.TextSearch (searchInDatabase)
58 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
59 import Gargantext.Database.Utils (Cmd, CmdM)
60 import Gargantext.Ext.IMT (toSchoolName)
61 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
62 import Gargantext.Prelude
63 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
64 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
66 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
67 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
68 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
69 import System.FilePath (FilePath)
70 import qualified Data.List as List
71 import qualified Data.Map as Map
72 import qualified Data.Text as Text
73 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
74 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
75
76 type FlowCmdM env err m =
77 ( CmdM env err m
78 , RepoCmdM env err m
79 , HasNodeError err
80 , HasRepoVar env
81 )
82
83 type FlowCorpus a = ( AddUniqId a
84 , UniqId a
85 , InsertDb a
86 , ExtractNgramsT a
87 , HasText a
88 )
89
90 ------------------------------------------------------------------------
91
92 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
93 -- | APIs
94 -- TODO instances
95 getDataApi :: Lang
96 -> Maybe Limit
97 -> ApiQuery
98 -> IO [HyperdataDocument]
99 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
100 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
101
102
103 flowCorpusApi :: ( FlowCmdM env err m)
104 => Username -> CorpusName
105 -> TermType Lang
106 -> Maybe Limit
107 -> ApiQuery
108 -> m CorpusId
109 flowCorpusApi u n tt l q = do
110 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
111 flowCorpus u n tt docs
112
113 ------------------------------------------------------------------------
114
115
116 flowAnnuaire :: FlowCmdM env err m
117 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
118 flowAnnuaire u n l filePath = do
119 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
120 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
121
122
123 flowCorpusDebat :: FlowCmdM env err m
124 => Username -> CorpusName
125 -> Limit -> FilePath
126 -> m CorpusId
127 flowCorpusDebat u n l fp = do
128 docs <- liftIO ( splitEvery 500
129 <$> take l
130 <$> GD.readFile fp
131 :: IO [[GD.GrandDebatReference ]]
132 )
133 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
134
135 flowCorpusFile :: FlowCmdM env err m
136 => Username -> CorpusName
137 -> Limit -- Limit the number of docs (for dev purpose)
138 -> TermType Lang -> FileFormat -> FilePath
139 -> m CorpusId
140 flowCorpusFile u n l la ff fp = do
141 docs <- liftIO ( splitEvery 500
142 <$> take l
143 <$> parseFile ff fp
144 )
145 flowCorpus u n la (map (map toHyperdataDocument) docs)
146
147 -- TODO query with complex query
148 flowCorpusSearchInDatabase :: FlowCmdM env err m
149 => Username -> Lang -> Text -> m CorpusId
150 flowCorpusSearchInDatabase u la q = do
151 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
152 ids <- map fst <$> searchInDatabase cId (stemIt q)
153 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
154
155
156 flowCorpusSearchInDatabase' :: FlowCmdM env err m
157 => Username -> Lang -> Text -> m CorpusId
158 flowCorpusSearchInDatabase' u la q = do
159 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
160 ids <- map fst <$> searchInDatabase cId (stemIt q)
161 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
162
163 ------------------------------------------------------------------------
164
165 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
166 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
167 flow c u cn la docs = do
168 ids <- mapM (insertMasterDocs c la ) docs
169 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
170
171 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
172 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
173 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
174
175
176 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
177 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
178 flowCorpusUser l userName corpusName ctype ids = do
179 -- User Flow
180 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
181 -- TODO: check if present already, ignore
182 _ <- Doc.add userCorpusId ids
183
184 -- User List Flow
185 --{-
186 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
187 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
188 userListId <- flowList userId userCorpusId ngs
189 printDebug "userListId" userListId
190 -- User Graph Flow
191 _ <- mkGraph userCorpusId userId
192 _ <- mkPhylo userCorpusId userId
193 --}
194
195 -- User Dashboard Flow
196 _ <- mkDashboard userCorpusId userId
197
198 -- Annuaire Flow
199 -- _ <- mkAnnuaire rootUserId userId
200 pure userCorpusId
201
202
203 insertMasterDocs :: ( FlowCmdM env ServantErr m
204 , FlowCorpus a
205 , MkCorpus c
206 )
207 => Maybe c -> TermType Lang -> [a] -> m [DocId]
208 insertMasterDocs c lang hs = do
209 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
210
211 -- TODO Type NodeDocumentUnicised
212 let hs' = map addUniqId hs
213 ids <- insertDb masterUserId masterCorpusId hs'
214 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
215
216 let
217 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
218 where
219 m' = case m of
220 Nothing -> trace ("buildTries here" :: String)
221 $ Just
222 $ buildTries n ( fmap toToken $ uniText
223 $ Text.intercalate " . "
224 $ List.concat
225 $ map hasText documentsWithId
226 )
227 just_m -> just_m
228 fixLang l = l
229
230 lang' = fixLang lang
231 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
232 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
233 terms2id <- insertNgrams $ Map.keys maps
234 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
235
236 lId <- getOrMkList masterCorpusId masterUserId
237 _ <- insertDocNgrams lId indexedNgrams
238 pure $ map reId ids
239
240
241
242 type CorpusName = Text
243
244 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
245 => Username -> CorpusName -> Maybe a
246 -> Cmd err (UserId, RootId, CorpusId)
247 getOrMkRootWithCorpus username cName c = do
248 maybeUserId <- getUser username
249 userId <- case maybeUserId of
250 Nothing -> nodeError NoUserFound
251 Just user -> pure $ userLight_id user
252
253 rootId' <- map _node_id <$> getRoot username
254
255 rootId'' <- case rootId' of
256 [] -> mkRoot username userId
257 n -> case length n >= 2 of
258 True -> nodeError ManyNodeUsers
259 False -> pure rootId'
260
261 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
262
263 corpusId'' <- if username == userMaster
264 then do
265 ns <- getCorporaWithParentId rootId
266 pure $ map _node_id ns
267 else
268 pure []
269
270 corpusId' <- if corpusId'' /= []
271 then pure corpusId''
272 else mk (Just cName) c rootId userId
273
274 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
275
276 pure (userId, rootId, corpusId)
277
278
279 ------------------------------------------------------------------------
280
281
282 class UniqId a
283 where
284 uniqId :: Lens' a (Maybe HashId)
285
286
287 instance UniqId HyperdataDocument
288 where
289 uniqId = hyperdataDocument_uniqId
290
291 instance UniqId HyperdataContact
292 where
293 uniqId = hc_uniqId
294
295 viewUniqId' :: UniqId a => a -> (HashId, a)
296 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
297 where
298 err = panic "[ERROR] Database.Flow.toInsert"
299
300
301 toInserted :: [ReturnId] -> Map HashId ReturnId
302 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
303 . filter (\r -> reInserted r == True)
304
305 data DocumentWithId a = DocumentWithId
306 { documentId :: !NodeId
307 , documentData :: !a
308 } deriving (Show)
309
310 instance HasText a => HasText (DocumentWithId a)
311 where
312 hasText (DocumentWithId _ a) = hasText a
313
314 mergeData :: Map HashId ReturnId
315 -> Map HashId a
316 -> [DocumentWithId a]
317 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
318 where
319 toDocumentWithId (hash,hpd) =
320 DocumentWithId <$> fmap reId (lookup hash rs)
321 <*> Just hpd
322
323 ------------------------------------------------------------------------
324 data DocumentIdWithNgrams a = DocumentIdWithNgrams
325 { documentWithId :: !(DocumentWithId a)
326 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
327 } deriving (Show)
328
329
330 class ExtractNgramsT h
331 where
332 extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
333
334 class HasText h
335 where
336 hasText :: h -> [Text]
337
338 instance HasText HyperdataContact
339 where
340 hasText = undefined
341
342 instance ExtractNgramsT HyperdataContact
343 where
344 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
345 where
346 extract :: TermType Lang -> HyperdataContact
347 -> Cmd err (Map Ngrams (Map NgramsType Int))
348 extract _l hc' = do
349 let authors = map text2ngrams
350 $ maybe ["Nothing"] (\a -> [a])
351 $ view (hc_who . _Just . cw_lastName) hc'
352
353 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
354
355 instance HasText HyperdataDocument
356 where
357 hasText h = catMaybes [ _hyperdataDocument_title h
358 , _hyperdataDocument_abstract h
359 ]
360
361 instance ExtractNgramsT HyperdataDocument
362 where
363 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
364 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
365 where
366 extractNgramsT' :: TermType Lang -> HyperdataDocument
367 -> Cmd err (Map Ngrams (Map NgramsType Int))
368 extractNgramsT' lang' doc = do
369 let source = text2ngrams
370 $ maybe "Nothing" identity
371 $ _hyperdataDocument_source doc
372
373 institutes = map text2ngrams
374 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
375 $ _hyperdataDocument_institutes doc
376
377 authors = map text2ngrams
378 $ maybe ["Nothing"] (splitOn ", ")
379 $ _hyperdataDocument_authors doc
380
381 terms' <- map text2ngrams
382 <$> map (intercalate " " . _terms_label)
383 <$> concat
384 <$> liftIO (extractTerms lang' $ hasText doc)
385
386 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
387 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
388 <> [(a', Map.singleton Authors 1) | a' <- authors ]
389 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
390
391
392 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
393 -> Map Ngrams (Map NgramsType Int)
394 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
395 where
396 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
397 True -> (ng,y)
398 False -> (Ngrams (Text.take s' t) n , y)
399
400
401 documentIdWithNgrams :: HasNodeError err
402 => (a
403 -> Cmd err (Map Ngrams (Map NgramsType Int)))
404 -> [DocumentWithId a]
405 -> Cmd err [DocumentIdWithNgrams a]
406 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
407 where
408 toDocumentIdWithNgrams d = do
409 e <- f $ documentData d
410 pure $ DocumentIdWithNgrams d e
411
412
413 -- FLOW LIST
414 -- | TODO check optimization
415 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
416 -> Map Ngrams (Map NgramsType (Map NodeId Int))
417 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
418 where
419 f :: DocumentIdWithNgrams a
420 -> Map Ngrams (Map NgramsType (Map NodeId Int))
421 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
422 where
423 nId = documentId $ documentWithId d
424
425 ------------------------------------------------------------------------
426 listInsert :: FlowCmdM env err m
427 => ListId -> Map NgramsType [NgramsElement]
428 -> m ()
429 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
430 -> putListNgrams lId typeList ngElmts
431 ) $ toList ngs
432
433 flowList :: FlowCmdM env err m => UserId -> CorpusId
434 -> Map NgramsType [NgramsElement]
435 -> m ListId
436 flowList uId cId ngs = do
437 lId <- getOrMkList cId uId
438 printDebug "listId flowList" lId
439 listInsert lId ngs
440 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
441 pure lId
442