]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[COSMETICS]
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
22
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
32
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 where
35 import Prelude (String)
36 import Debug.Trace (trace)
37 import Control.Lens ((^.), view, _Just)
38 import Control.Monad (mapM_)
39 import Control.Monad.IO.Class (liftIO)
40 import Data.List (concat)
41 import Data.Map (Map, lookup, toList)
42 import Data.Maybe (Maybe(..), catMaybes)
43 import Data.Monoid
44 import Data.Text (Text, splitOn, intercalate)
45 import GHC.Show (Show)
46 import Gargantext.API.Ngrams (HasRepoVar)
47 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
48 import Gargantext.Core (Lang(..))
49 import Gargantext.Core.Types (NodePoly(..), Terms(..))
50 import Gargantext.Core.Types.Individu (Username)
51 import Gargantext.Core.Flow
52 import Gargantext.Core.Types.Main
53 import Gargantext.Database.Config (userMaster, corpusMasterName)
54 import Gargantext.Database.Flow.Utils (insertDocNgrams)
55 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
56 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
57 import Gargantext.Database.Root (getRoot)
58 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
59 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
60 import Gargantext.Database.Schema.User (getUser, UserLight(..))
61 import Gargantext.Database.TextSearch (searchInDatabase)
62 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
63 import Gargantext.Database.Utils (Cmd, CmdM)
64 import Gargantext.Ext.IMT (toSchoolName)
65 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
66 import Gargantext.Prelude
67 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
68 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
69 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
70 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
71 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
72 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
73 import System.FilePath (FilePath)
74 import qualified Data.List as List
75 import qualified Data.Map as Map
76 import qualified Data.Text as Text
77 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
78 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
79
80 type FlowCmdM env err m =
81 ( CmdM env err m
82 , RepoCmdM env err m
83 , HasNodeError err
84 , HasRepoVar env
85 )
86
87 ------------------------------------------------------------------------
88
89 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
90 -- | APIs
91 -- TODO instances
92 getDataApi :: Lang
93 -> Maybe Limit
94 -> ApiQuery
95 -> IO [HyperdataDocument]
96 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
97 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
98
99
100 flowCorpusApi :: ( FlowCmdM env err m)
101 => Username -> CorpusName
102 -> TermType Lang
103 -> Maybe Limit
104 -> ApiQuery
105 -> m CorpusId
106 flowCorpusApi u n tt l q = do
107 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
108 flowCorpus u n tt docs
109
110 ------------------------------------------------------------------------
111
112 flowAnnuaire :: FlowCmdM env err m
113 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
114 flowAnnuaire u n l filePath = do
115 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
116 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
117
118
119 flowCorpusDebat :: FlowCmdM env err m
120 => Username -> CorpusName
121 -> Limit -> FilePath
122 -> m CorpusId
123 flowCorpusDebat u n l fp = do
124 docs <- liftIO ( splitEvery 500
125 <$> take l
126 <$> GD.readFile fp
127 :: IO [[GD.GrandDebatReference ]]
128 )
129 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
130
131 flowCorpusFile :: FlowCmdM env err m
132 => Username -> CorpusName
133 -> Limit -- Limit the number of docs (for dev purpose)
134 -> TermType Lang -> FileFormat -> FilePath
135 -> m CorpusId
136 flowCorpusFile u n l la ff fp = do
137 docs <- liftIO ( splitEvery 500
138 <$> take l
139 <$> parseFile ff fp
140 )
141 flowCorpus u n la (map (map toHyperdataDocument) docs)
142
143 -- TODO query with complex query
144 flowCorpusSearchInDatabase :: FlowCmdM env err m
145 => Username -> Lang -> Text -> m CorpusId
146 flowCorpusSearchInDatabase u la q = do
147 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
148 ids <- map fst <$> searchInDatabase cId (stemIt q)
149 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
150
151
152 flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
153 => Username -> Lang -> Text -> m CorpusId
154 flowCorpusSearchInDatabaseApi u la q = do
155 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
156 ids <- map fst <$> searchInDatabase cId (stemIt q)
157 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
158
159 ------------------------------------------------------------------------
160 -- | TODO improve the needed type to create/update a corpus
161 data UserInfo = Username Text
162 | UserId NodeId
163 data CorpusInfo = CorpusName Lang Text
164 | CorpusId Lang NodeId
165
166
167 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
168 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
169 flow c u cn la docs = do
170 ids <- mapM (insertMasterDocs c la ) docs
171 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
172
173 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
174 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
175 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
176
177 ------------------------------------------------------------------------
178
179
180 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
181 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
182 flowCorpusUser l userName corpusName ctype ids = do
183 -- User Flow
184 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
185 -- TODO: check if present already, ignore
186 _ <- Doc.add userCorpusId ids
187
188 -- User List Flow
189 --{-
190 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
191 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
192 userListId <- flowList userId userCorpusId ngs
193 printDebug "userListId" userListId
194 -- User Graph Flow
195 _ <- mkGraph userCorpusId userId
196 _ <- mkPhylo userCorpusId userId
197 --}
198
199 -- User Dashboard Flow
200 _ <- mkDashboard userCorpusId userId
201
202 -- Annuaire Flow
203 -- _ <- mkAnnuaire rootUserId userId
204 pure userCorpusId
205
206
207 insertMasterDocs :: ( FlowCmdM env err m
208 , FlowCorpus a
209 , MkCorpus c
210 )
211 => Maybe c -> TermType Lang -> [a] -> m [DocId]
212 insertMasterDocs c lang hs = do
213 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
214
215 -- TODO Type NodeDocumentUnicised
216 let hs' = map addUniqId hs
217 ids <- insertDb masterUserId masterCorpusId hs'
218 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
219
220 let
221 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
222 where
223 m' = case m of
224 Nothing -> trace ("buildTries here" :: String)
225 $ Just
226 $ buildTries n ( fmap toToken $ uniText
227 $ Text.intercalate " . "
228 $ List.concat
229 $ map hasText documentsWithId
230 )
231 just_m -> just_m
232 fixLang l = l
233
234 lang' = fixLang lang
235 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
236 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
237 terms2id <- insertNgrams $ Map.keys maps
238 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
239
240 lId <- getOrMkList masterCorpusId masterUserId
241 _ <- insertDocNgrams lId indexedNgrams
242 pure $ map reId ids
243
244
245
246 type CorpusName = Text
247
248 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
249 => Username -> CorpusName -> Maybe a
250 -> Cmd err (UserId, RootId, CorpusId)
251 getOrMkRootWithCorpus username cName c = do
252 maybeUserId <- getUser username
253 userId <- case maybeUserId of
254 Nothing -> nodeError NoUserFound
255 Just user -> pure $ userLight_id user
256
257 rootId' <- map _node_id <$> getRoot username
258
259 rootId'' <- case rootId' of
260 [] -> mkRoot username userId
261 n -> case length n >= 2 of
262 True -> nodeError ManyNodeUsers
263 False -> pure rootId'
264
265 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
266
267 corpusId'' <- if username == userMaster
268 then do
269 ns <- getCorporaWithParentId rootId
270 pure $ map _node_id ns
271 else
272 pure []
273
274 corpusId' <- if corpusId'' /= []
275 then pure corpusId''
276 else mk (Just cName) c rootId userId
277
278 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
279
280 pure (userId, rootId, corpusId)
281
282
283 ------------------------------------------------------------------------
284
285
286 viewUniqId' :: UniqId a => a -> (HashId, a)
287 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
288 where
289 err = panic "[ERROR] Database.Flow.toInsert"
290
291
292 toInserted :: [ReturnId] -> Map HashId ReturnId
293 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
294 . filter (\r -> reInserted r == True)
295
296 data DocumentWithId a = DocumentWithId
297 { documentId :: !NodeId
298 , documentData :: !a
299 } deriving (Show)
300
301 instance HasText a => HasText (DocumentWithId a)
302 where
303 hasText (DocumentWithId _ a) = hasText a
304
305 mergeData :: Map HashId ReturnId
306 -> Map HashId a
307 -> [DocumentWithId a]
308 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
309 where
310 toDocumentWithId (hash,hpd) =
311 DocumentWithId <$> fmap reId (lookup hash rs)
312 <*> Just hpd
313
314 ------------------------------------------------------------------------
315 data DocumentIdWithNgrams a = DocumentIdWithNgrams
316 { documentWithId :: !(DocumentWithId a)
317 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
318 } deriving (Show)
319
320
321 instance HasText HyperdataContact
322 where
323 hasText = undefined
324
325 instance ExtractNgramsT HyperdataContact
326 where
327 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
328 where
329 extract :: TermType Lang -> HyperdataContact
330 -> Cmd err (Map Ngrams (Map NgramsType Int))
331 extract _l hc' = do
332 let authors = map text2ngrams
333 $ maybe ["Nothing"] (\a -> [a])
334 $ view (hc_who . _Just . cw_lastName) hc'
335
336 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
337
338 instance HasText HyperdataDocument
339 where
340 hasText h = catMaybes [ _hyperdataDocument_title h
341 , _hyperdataDocument_abstract h
342 ]
343
344 instance ExtractNgramsT HyperdataDocument
345 where
346 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
347 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
348 where
349 extractNgramsT' :: TermType Lang -> HyperdataDocument
350 -> Cmd err (Map Ngrams (Map NgramsType Int))
351 extractNgramsT' lang' doc = do
352 let source = text2ngrams
353 $ maybe "Nothing" identity
354 $ _hyperdataDocument_source doc
355
356 institutes = map text2ngrams
357 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
358 $ _hyperdataDocument_institutes doc
359
360 authors = map text2ngrams
361 $ maybe ["Nothing"] (splitOn ", ")
362 $ _hyperdataDocument_authors doc
363
364 terms' <- map text2ngrams
365 <$> map (intercalate " " . _terms_label)
366 <$> concat
367 <$> liftIO (extractTerms lang' $ hasText doc)
368
369 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
370 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
371 <> [(a', Map.singleton Authors 1) | a' <- authors ]
372 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
373
374
375 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
376 -> Map Ngrams (Map NgramsType Int)
377 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
378 where
379 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
380 True -> (ng,y)
381 False -> (Ngrams (Text.take s' t) n , y)
382
383
384 documentIdWithNgrams :: HasNodeError err
385 => (a
386 -> Cmd err (Map Ngrams (Map NgramsType Int)))
387 -> [DocumentWithId a]
388 -> Cmd err [DocumentIdWithNgrams a]
389 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
390 where
391 toDocumentIdWithNgrams d = do
392 e <- f $ documentData d
393 pure $ DocumentIdWithNgrams d e
394
395
396 -- FLOW LIST
397 -- | TODO check optimization
398 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
399 -> Map Ngrams (Map NgramsType (Map NodeId Int))
400 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
401 where
402 f :: DocumentIdWithNgrams a
403 -> Map Ngrams (Map NgramsType (Map NodeId Int))
404 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
405 where
406 nId = documentId $ documentWithId d
407
408 ------------------------------------------------------------------------
409 listInsert :: FlowCmdM env err m
410 => ListId -> Map NgramsType [NgramsElement]
411 -> m ()
412 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
413 -> putListNgrams lId typeList ngElmts
414 ) $ toList ngs
415
416 flowList :: FlowCmdM env err m => UserId -> CorpusId
417 -> Map NgramsType [NgramsElement]
418 -> m ListId
419 flowList uId cId ngs = do
420 lId <- getOrMkList cId uId
421 printDebug "listId flowList" lId
422 listInsert lId ngs
423 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
424 pure lId
425