]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Merge branch 'dev' into dev-phylo
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE FlexibleContexts #-}
24 {-# LANGUAGE InstanceSigs #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27 {-# LANGUAGE RankNTypes #-}
28 {-# LANGUAGE ConstrainedClassMethods #-}
29
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 where
32 import Prelude (String)
33 import Debug.Trace (trace)
34 import Control.Lens ((^.), view, Lens', _Just)
35 import Control.Monad (mapM_)
36 import Control.Monad.IO.Class (liftIO)
37 import Data.List (concat)
38 import Data.Map (Map, lookup, toList)
39 import Data.Maybe (Maybe(..), catMaybes)
40 import Data.Monoid
41 import Data.Text (Text, splitOn, intercalate)
42 import GHC.Show (Show)
43 import Gargantext.API.Ngrams (HasRepoVar)
44 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
45 import Gargantext.Core (Lang(..))
46 import Gargantext.Core.Types (NodePoly(..), Terms(..))
47 import Gargantext.Core.Types.Individu (Username)
48 import Gargantext.Core.Types.Main
49 import Gargantext.Database.Config (userMaster, corpusMasterName)
50 import Gargantext.Database.Flow.Utils (insertDocNgrams)
51 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
52 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
53 import Gargantext.Database.Root (getRoot)
54 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
55 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
56 import Gargantext.Database.Schema.User (getUser, UserLight(..))
57 import Gargantext.Database.TextSearch (searchInDatabase)
58 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
59 import Gargantext.Database.Utils (Cmd, CmdM)
60 import Gargantext.Ext.IMT (toSchoolName)
61 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
62 import Gargantext.Prelude
63 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
64 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65 import Gargantext.Text.Parsers (parseFile, FileFormat)
66 import qualified Gargantext.Text.Parsers.IsidoreApi as Isidore
67 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
68 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
69 import Servant (ServantErr)
70 import System.FilePath (FilePath)
71 import qualified Data.List as List
72 import qualified Data.Map as Map
73 import qualified Data.Text as Text
74 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
75 import qualified Gargantext.Text.Parsers.GrandDebat as GD
76
77 type FlowCmdM env err m =
78 ( CmdM env err m
79 , RepoCmdM env err m
80 , HasNodeError err
81 , HasRepoVar env
82 )
83
84 type FlowCorpus a = ( AddUniqId a
85 , UniqId a
86 , InsertDb a
87 , ExtractNgramsT a
88 , HasText a
89 )
90
91 ------------------------------------------------------------------------
92
93 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
94 -- | APIs
95 -- TODO instances
96 getDataApi :: Lang
97 -> Maybe Limit
98 -> ApiQuery
99 -> IO [HyperdataDocument]
100 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
101 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
102
103
104 flowCorpusApi :: ( FlowCmdM env ServantErr m)
105 => Username -> CorpusName
106 -> TermType Lang
107 -> Maybe Limit
108 -> ApiQuery
109 -> m CorpusId
110 flowCorpusApi u n tt l q = do
111 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
112 flowCorpus u n tt docs
113
114 ------------------------------------------------------------------------
115
116
117 flowAnnuaire :: FlowCmdM env ServantErr m
118 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
119 flowAnnuaire u n l filePath = do
120 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
121 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
122
123
124 flowCorpusDebat :: FlowCmdM env ServantErr m
125 => Username -> CorpusName
126 -> Limit -> FilePath
127 -> m CorpusId
128 flowCorpusDebat u n l fp = do
129 docs <- liftIO ( splitEvery 500
130 <$> take l
131 <$> GD.readFile fp
132 :: IO [[GD.GrandDebatReference ]]
133 )
134 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
135
136 flowCorpusFile :: FlowCmdM env ServantErr m
137 => Username -> CorpusName
138 -> Limit -- Limit the number of docs (for dev purpose)
139 -> TermType Lang -> FileFormat -> FilePath
140 -> m CorpusId
141 flowCorpusFile u n l la ff fp = do
142 docs <- liftIO ( splitEvery 500
143 <$> take l
144 <$> parseFile ff fp
145 )
146 flowCorpus u n la (map (map toHyperdataDocument) docs)
147
148 -- TODO query with complex query
149 flowCorpusSearchInDatabase :: FlowCmdM env err m
150 => Username -> Lang -> Text -> m CorpusId
151 flowCorpusSearchInDatabase u la q = do
152 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
153 ids <- map fst <$> searchInDatabase cId (stemIt q)
154 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
155
156
157 flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
158 => Username -> Lang -> Text -> m CorpusId
159 flowCorpusSearchInDatabase' u la q = do
160 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
161 ids <- map fst <$> searchInDatabase cId (stemIt q)
162 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
163
164 ------------------------------------------------------------------------
165
166 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
167 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
168 flow c u cn la docs = do
169 ids <- mapM (insertMasterDocs c la ) docs
170 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
171
172 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
173 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
174 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
175
176
177 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
178 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
179 flowCorpusUser l userName corpusName ctype ids = do
180 -- User Flow
181 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
182 -- TODO: check if present already, ignore
183 _ <- Doc.add userCorpusId ids
184
185 -- User List Flow
186 --{-
187 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
188 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
189 userListId <- flowList userId userCorpusId ngs
190 printDebug "userListId" userListId
191 -- User Graph Flow
192 _ <- mkGraph userCorpusId userId
193 --}
194
195 -- User Dashboard Flow
196 _ <- mkDashboard userCorpusId userId
197
198 -- Annuaire Flow
199 -- _ <- mkAnnuaire rootUserId userId
200 pure userCorpusId
201
202
203 insertMasterDocs :: ( FlowCmdM env ServantErr m
204 , FlowCorpus a
205 , MkCorpus c
206 )
207 => Maybe c -> TermType Lang -> [a] -> m [DocId]
208 insertMasterDocs c lang hs = do
209 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
210
211 -- TODO Type NodeDocumentUnicised
212 let hs' = map addUniqId hs
213 ids <- insertDb masterUserId masterCorpusId hs'
214 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
215
216 let
217 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
218 where
219 m' = case m of
220 Nothing -> trace ("buildTries here" :: String) $ Just $ buildTries n (fmap toToken $ uniText $ Text.intercalate " " $ List.concat $ map hasText documentsWithId)
221 m'' -> m''
222 fixLang l = l
223
224 lang' = fixLang lang
225 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
226 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
227 terms2id <- insertNgrams $ Map.keys maps
228 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
229
230 lId <- getOrMkList masterCorpusId masterUserId
231 _ <- insertDocNgrams lId indexedNgrams
232 pure $ map reId ids
233
234
235
236 type CorpusName = Text
237
238 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
239 => Username -> CorpusName -> Maybe a
240 -> Cmd err (UserId, RootId, CorpusId)
241 getOrMkRootWithCorpus username cName c = do
242 maybeUserId <- getUser username
243 userId <- case maybeUserId of
244 Nothing -> nodeError NoUserFound
245 Just user -> pure $ userLight_id user
246
247 rootId' <- map _node_id <$> getRoot username
248
249 rootId'' <- case rootId' of
250 [] -> mkRoot username userId
251 n -> case length n >= 2 of
252 True -> nodeError ManyNodeUsers
253 False -> pure rootId'
254
255 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
256
257 corpusId'' <- if username == userMaster
258 then do
259 ns <- getCorporaWithParentId rootId
260 pure $ map _node_id ns
261 else
262 pure []
263
264 corpusId' <- if corpusId'' /= []
265 then pure corpusId''
266 else mk (Just cName) c rootId userId
267
268 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
269
270 pure (userId, rootId, corpusId)
271
272
273 ------------------------------------------------------------------------
274
275
276 class UniqId a
277 where
278 uniqId :: Lens' a (Maybe HashId)
279
280
281 instance UniqId HyperdataDocument
282 where
283 uniqId = hyperdataDocument_uniqId
284
285 instance UniqId HyperdataContact
286 where
287 uniqId = hc_uniqId
288
289 viewUniqId' :: UniqId a => a -> (HashId, a)
290 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
291 where
292 err = panic "[ERROR] Database.Flow.toInsert"
293
294
295 toInserted :: [ReturnId] -> Map HashId ReturnId
296 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
297 . filter (\r -> reInserted r == True)
298
299 data DocumentWithId a = DocumentWithId
300 { documentId :: !NodeId
301 , documentData :: !a
302 } deriving (Show)
303
304 instance HasText a => HasText (DocumentWithId a)
305 where
306 hasText (DocumentWithId _ a) = hasText a
307
308 mergeData :: Map HashId ReturnId
309 -> Map HashId a
310 -> [DocumentWithId a]
311 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
312 where
313 toDocumentWithId (hash,hpd) =
314 DocumentWithId <$> fmap reId (lookup hash rs)
315 <*> Just hpd
316
317 ------------------------------------------------------------------------
318 data DocumentIdWithNgrams a = DocumentIdWithNgrams
319 { documentWithId :: !(DocumentWithId a)
320 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
321 } deriving (Show)
322
323
324 class ExtractNgramsT h
325 where
326 extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
327
328 class HasText h
329 where
330 hasText :: h -> [Text]
331
332 instance HasText HyperdataContact
333 where
334 hasText = undefined
335
336 instance ExtractNgramsT HyperdataContact
337 where
338 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
339 where
340 extract :: TermType Lang -> HyperdataContact
341 -> Cmd err (Map Ngrams (Map NgramsType Int))
342 extract _l hc' = do
343 let authors = map text2ngrams
344 $ maybe ["Nothing"] (\a -> [a])
345 $ view (hc_who . _Just . cw_lastName) hc'
346
347 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
348
349 instance HasText HyperdataDocument
350 where
351 hasText h = catMaybes [ _hyperdataDocument_title h
352 , _hyperdataDocument_abstract h
353 ]
354
355 instance ExtractNgramsT HyperdataDocument
356 where
357 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
358 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
359 where
360 extractNgramsT' :: TermType Lang -> HyperdataDocument
361 -> Cmd err (Map Ngrams (Map NgramsType Int))
362 extractNgramsT' lang' doc = do
363 let source = text2ngrams
364 $ maybe "Nothing" identity
365 $ _hyperdataDocument_source doc
366
367 institutes = map text2ngrams
368 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
369 $ _hyperdataDocument_institutes doc
370
371 authors = map text2ngrams
372 $ maybe ["Nothing"] (splitOn ", ")
373 $ _hyperdataDocument_authors doc
374
375 terms' <- map text2ngrams
376 <$> map (intercalate " " . _terms_label)
377 <$> concat
378 <$> liftIO (extractTerms lang' $ hasText doc)
379
380 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
381 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
382 <> [(a', Map.singleton Authors 1) | a' <- authors ]
383 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
384
385
386 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
387 -> Map Ngrams (Map NgramsType Int)
388 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
389 where
390 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
391 True -> (ng,y)
392 False -> (Ngrams (Text.take s' t) n , y)
393
394
395 documentIdWithNgrams :: HasNodeError err
396 => (a
397 -> Cmd err (Map Ngrams (Map NgramsType Int)))
398 -> [DocumentWithId a]
399 -> Cmd err [DocumentIdWithNgrams a]
400 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
401 where
402 toDocumentIdWithNgrams d = do
403 e <- f $ documentData d
404 pure $ DocumentIdWithNgrams d e
405
406
407 -- FLOW LIST
408 -- | TODO check optimization
409 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
410 -> Map Ngrams (Map NgramsType (Map NodeId Int))
411 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
412 where
413 f :: DocumentIdWithNgrams a
414 -> Map Ngrams (Map NgramsType (Map NodeId Int))
415 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
416 where
417 nId = documentId $ documentWithId d
418
419 ------------------------------------------------------------------------
420 listInsert :: FlowCmdM env err m
421 => ListId -> Map NgramsType [NgramsElement]
422 -> m ()
423 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
424 -> putListNgrams lId typeList ngElmts
425 ) $ toList ngs
426
427 flowList :: FlowCmdM env err m => UserId -> CorpusId
428 -> Map NgramsType [NgramsElement]
429 -> m ListId
430 flowList uId cId ngs = do
431 lId <- getOrMkList cId uId
432 printDebug "listId flowList" lId
433 listInsert lId ngs
434 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
435 pure lId
436