]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
ServantJob, addToCorpus
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
22
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
32
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 ( FlowCmdM
35 , flowCorpusFile
36 , flowCorpus
37 , flowCorpusSearchInDatabase
38 , getOrMkRoot
39 , getOrMkRootWithCorpus
40 )
41 where
42 import Prelude (String)
43 import Data.Either
44 import Debug.Trace (trace)
45 import Control.Lens ((^.), view, _Just)
46 import Control.Monad (mapM_)
47 import Control.Monad.IO.Class (liftIO)
48 import Data.List (concat)
49 import Data.Map (Map, lookup, toList)
50 import Data.Maybe (Maybe(..), catMaybes)
51 import Data.Monoid
52 import Data.Text (Text, splitOn, intercalate)
53 import GHC.Show (Show)
54 import Gargantext.API.Ngrams (HasRepoVar)
55 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
56 import Gargantext.Core (Lang(..))
57 import Gargantext.Core.Types (NodePoly(..), Terms(..))
58 import Gargantext.Core.Types.Individu (Username)
59 import Gargantext.Core.Flow
60 import Gargantext.Core.Types.Main
61 import Gargantext.Database.Config (userMaster, corpusMasterName)
62 import Gargantext.Database.Flow.Utils (insertDocNgrams)
63 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
64 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
65 import Gargantext.Database.Root (getRoot)
66 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
67 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
68 import Gargantext.Database.Schema.User (getUser, UserLight(..))
69 import Gargantext.Database.TextSearch (searchInDatabase)
70 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
71 import Gargantext.Database.Utils (Cmd, CmdM)
72 import Gargantext.Ext.IMT (toSchoolName)
73 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
74 import Gargantext.Prelude
75 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
76 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
77 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
78 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
79 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
80 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
81 import Gargantext.Prelude.Utils hiding (hash)
82 import System.FilePath (FilePath)
83 import qualified Data.List as List
84 import qualified Data.Map as Map
85 import qualified Data.Text as Text
86 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
87 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
88
89 type FlowCmdM env err m =
90 ( CmdM env err m
91 , RepoCmdM env err m
92 , HasNodeError err
93 , HasRepoVar env
94 )
95
96 ------------------------------------------------------------------------
97
98 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
99 -- | APIs
100 -- TODO instances
101 getDataApi :: Lang
102 -> Maybe Limit
103 -> ApiQuery
104 -> IO [HyperdataDocument]
105 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
106 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
107
108
109 -- UNUSED
110 _flowCorpusApi :: ( FlowCmdM env err m)
111 => Username -> Either CorpusName [CorpusId]
112 -> TermType Lang
113 -> Maybe Limit
114 -> ApiQuery
115 -> m CorpusId
116 _flowCorpusApi u n tt l q = do
117 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
118 flowCorpus u n tt docs
119
120 ------------------------------------------------------------------------
121
122 -- UNUSED
123 _flowAnnuaire :: FlowCmdM env err m
124 => Username -> Either CorpusName [CorpusId] -> (TermType Lang) -> FilePath -> m AnnuaireId
125 _flowAnnuaire u n l filePath = do
126 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
127 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
128
129 -- UNUSED
130 _flowCorpusDebat :: FlowCmdM env err m
131 => Username -> Either CorpusName [CorpusId]
132 -> Limit -> FilePath
133 -> m CorpusId
134 _flowCorpusDebat u n l fp = do
135 docs <- liftIO ( splitEvery 500
136 <$> take l
137 <$> readFile' fp
138 :: IO [[GD.GrandDebatReference ]]
139 )
140 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
141
142 flowCorpusFile :: FlowCmdM env err m
143 => Username -> Either CorpusName [CorpusId]
144 -> Limit -- Limit the number of docs (for dev purpose)
145 -> TermType Lang -> FileFormat -> FilePath
146 -> m CorpusId
147 flowCorpusFile u n l la ff fp = do
148 docs <- liftIO ( splitEvery 500
149 <$> take l
150 <$> parseFile ff fp
151 )
152 flowCorpus u n la (map (map toHyperdataDocument) docs)
153
154 -- TODO query with complex query
155 flowCorpusSearchInDatabase :: FlowCmdM env err m
156 => Username -> Lang -> Text -> m CorpusId
157 flowCorpusSearchInDatabase u la q = do
158 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
159 ids <- map fst <$> searchInDatabase cId (stemIt q)
160 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
161
162
163 -- UNUSED
164 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
165 => Username -> Lang -> Text -> m CorpusId
166 _flowCorpusSearchInDatabaseApi u la q = do
167 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
168 ids <- map fst <$> searchInDatabase cId (stemIt q)
169 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
170
171 ------------------------------------------------------------------------
172 -- | TODO improve the needed type to create/update a corpus
173 {- UNUSED
174 data UserInfo = Username Text
175 | UserId NodeId
176 data CorpusInfo = CorpusName Lang Text
177 | CorpusId Lang NodeId
178 -}
179
180
181 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
182 => Maybe c -> Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
183 flow c u cn la docs = do
184 ids <- mapM (insertMasterDocs c la ) docs
185 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
186
187 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
188 => Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
189 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
190
191 ------------------------------------------------------------------------
192 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
193 => Lang -> Username -> Either CorpusName [CorpusId] -> Maybe c -> [NodeId] -> m CorpusId
194 flowCorpusUser l userName corpusName ctype ids = do
195 -- User Flow
196 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
197 -- TODO: check if present already, ignore
198 _ <- Doc.add userCorpusId ids
199 tId <- mkNode NodeTexts userCorpusId userId
200
201 printDebug "Node Text Id" tId
202
203 -- User List Flow
204 --{-
205 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
206 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
207 userListId <- flowList userId userCorpusId ngs
208 printDebug "userListId" userListId
209 -- User Graph Flow
210 _ <- mkDashboard userCorpusId userId
211 _ <- mkGraph userCorpusId userId
212 --_ <- mkPhylo userCorpusId userId
213 --}
214
215
216 -- Annuaire Flow
217 -- _ <- mkAnnuaire rootUserId userId
218 pure userCorpusId
219
220
221 insertMasterDocs :: ( FlowCmdM env err m
222 , FlowCorpus a
223 , MkCorpus c
224 )
225 => Maybe c -> TermType Lang -> [a] -> m [DocId]
226 insertMasterDocs c lang hs = do
227 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
228
229 -- TODO Type NodeDocumentUnicised
230 let hs' = map addUniqId hs
231 ids <- insertDb masterUserId masterCorpusId hs'
232 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
233
234 let
235 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
236 where
237 m' = case m of
238 Nothing -> trace ("buildTries here" :: String)
239 $ Just
240 $ buildTries n ( fmap toToken $ uniText
241 $ Text.intercalate " . "
242 $ List.concat
243 $ map hasText documentsWithId
244 )
245 just_m -> just_m
246 fixLang l = l
247
248 lang' = fixLang lang
249 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
250 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
251 terms2id <- insertNgrams $ Map.keys maps
252 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
253
254 lId <- getOrMkList masterCorpusId masterUserId
255 _ <- insertDocNgrams lId indexedNgrams
256 pure $ map reId ids
257
258
259 type CorpusName = Text
260
261
262 getOrMkRoot :: (HasNodeError err) => Username -> Cmd err (UserId, RootId)
263 getOrMkRoot username = do
264 maybeUserId <- getUser username
265 userId <- case maybeUserId of
266 Nothing -> nodeError NoUserFound
267 Just user -> pure $ userLight_id user
268
269 rootId' <- map _node_id <$> getRoot username
270
271 rootId'' <- case rootId' of
272 [] -> mkRoot username userId
273 n -> case length n >= 2 of
274 True -> nodeError ManyNodeUsers
275 False -> pure rootId'
276
277 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
278 pure (userId, rootId)
279
280
281 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
282 => Username -> Either CorpusName [CorpusId] -> Maybe a
283 -> Cmd err (UserId, RootId, CorpusId)
284 getOrMkRootWithCorpus username cName c = do
285 (userId, rootId) <- getOrMkRoot username
286 corpusId'' <- if username == userMaster
287 then do
288 ns <- getCorporaWithParentId rootId
289 pure $ map _node_id ns
290 else
291 pure $ fromRight [] cName
292
293 corpusId' <- if corpusId'' /= []
294 then pure corpusId''
295 else mk (Just $ fromLeft "Default" cName) c rootId userId
296
297 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
298
299 pure (userId, rootId, corpusId)
300
301
302 ------------------------------------------------------------------------
303 viewUniqId' :: UniqId a => a -> (HashId, a)
304 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
305 where
306 err = panic "[ERROR] Database.Flow.toInsert"
307
308
309 toInserted :: [ReturnId] -> Map HashId ReturnId
310 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
311 . filter (\r -> reInserted r == True)
312
313 data DocumentWithId a = DocumentWithId
314 { documentId :: !NodeId
315 , documentData :: !a
316 } deriving (Show)
317
318 instance HasText a => HasText (DocumentWithId a)
319 where
320 hasText (DocumentWithId _ a) = hasText a
321
322 mergeData :: Map HashId ReturnId
323 -> Map HashId a
324 -> [DocumentWithId a]
325 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
326 where
327 toDocumentWithId (hash,hpd) =
328 DocumentWithId <$> fmap reId (lookup hash rs)
329 <*> Just hpd
330
331 ------------------------------------------------------------------------
332 data DocumentIdWithNgrams a = DocumentIdWithNgrams
333 { documentWithId :: !(DocumentWithId a)
334 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
335 } deriving (Show)
336
337
338 instance HasText HyperdataContact
339 where
340 hasText = undefined
341
342 instance ExtractNgramsT HyperdataContact
343 where
344 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
345 where
346 extract :: TermType Lang -> HyperdataContact
347 -> Cmd err (Map Ngrams (Map NgramsType Int))
348 extract _l hc' = do
349 let authors = map text2ngrams
350 $ maybe ["Nothing"] (\a -> [a])
351 $ view (hc_who . _Just . cw_lastName) hc'
352
353 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
354
355 instance HasText HyperdataDocument
356 where
357 hasText h = catMaybes [ _hyperdataDocument_title h
358 , _hyperdataDocument_abstract h
359 ]
360
361 instance ExtractNgramsT HyperdataDocument
362 where
363 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
364 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
365 where
366 extractNgramsT' :: TermType Lang -> HyperdataDocument
367 -> Cmd err (Map Ngrams (Map NgramsType Int))
368 extractNgramsT' lang' doc = do
369 let source = text2ngrams
370 $ maybe "Nothing" identity
371 $ _hyperdataDocument_source doc
372
373 institutes = map text2ngrams
374 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
375 $ _hyperdataDocument_institutes doc
376
377 authors = map text2ngrams
378 $ maybe ["Nothing"] (splitOn ", ")
379 $ _hyperdataDocument_authors doc
380
381 terms' <- map text2ngrams
382 <$> map (intercalate " " . _terms_label)
383 <$> concat
384 <$> liftIO (extractTerms lang' $ hasText doc)
385
386 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
387 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
388 <> [(a', Map.singleton Authors 1) | a' <- authors ]
389 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
390
391
392 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
393 -> Map Ngrams (Map NgramsType Int)
394 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
395 where
396 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
397 True -> (ng,y)
398 False -> (Ngrams (Text.take s' t) n , y)
399
400
401 documentIdWithNgrams :: HasNodeError err
402 => (a
403 -> Cmd err (Map Ngrams (Map NgramsType Int)))
404 -> [DocumentWithId a]
405 -> Cmd err [DocumentIdWithNgrams a]
406 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
407 where
408 toDocumentIdWithNgrams d = do
409 e <- f $ documentData d
410 pure $ DocumentIdWithNgrams d e
411
412
413 -- FLOW LIST
414 -- | TODO check optimization
415 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
416 -> Map Ngrams (Map NgramsType (Map NodeId Int))
417 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
418 where
419 f :: DocumentIdWithNgrams a
420 -> Map Ngrams (Map NgramsType (Map NodeId Int))
421 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
422 where
423 nId = documentId $ documentWithId d
424
425 ------------------------------------------------------------------------
426 listInsert :: FlowCmdM env err m
427 => ListId -> Map NgramsType [NgramsElement]
428 -> m ()
429 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
430 -> putListNgrams lId typeList ngElmts
431 ) $ toList ngs
432
433 flowList :: FlowCmdM env err m => UserId -> CorpusId
434 -> Map NgramsType [NgramsElement]
435 -> m ListId
436 flowList uId cId ngs = do
437 lId <- getOrMkList cId uId
438 printDebug "listId flowList" lId
439 listInsert lId ngs
440 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
441 pure lId
442