]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[REFACTO] WIP
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
22
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
32
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 where
35 import Prelude (String)
36 import Data.Either
37 import Debug.Trace (trace)
38 import Control.Lens ((^.), view, _Just)
39 import Control.Monad (mapM_)
40 import Control.Monad.IO.Class (liftIO)
41 import Data.List (concat)
42 import Data.Map (Map, lookup, toList)
43 import Data.Maybe (Maybe(..), catMaybes)
44 import Data.Monoid
45 import Data.Text (Text, splitOn, intercalate)
46 import GHC.Show (Show)
47 import Gargantext.API.Ngrams (HasRepoVar)
48 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Core.Types (NodePoly(..), Terms(..))
51 import Gargantext.Core.Types.Individu (Username)
52 import Gargantext.Core.Flow
53 import Gargantext.Core.Types.Main
54 import Gargantext.Database.Config (userMaster, corpusMasterName)
55 import Gargantext.Database.Flow.Utils (insertDocNgrams)
56 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
57 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
58 import Gargantext.Database.Root (getRoot)
59 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
60 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
61 import Gargantext.Database.Schema.User (getUser, UserLight(..))
62 import Gargantext.Database.TextSearch (searchInDatabase)
63 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
64 import Gargantext.Database.Utils (Cmd, CmdM)
65 import Gargantext.Ext.IMT (toSchoolName)
66 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
67 import Gargantext.Prelude
68 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
69 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
70 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
71 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
72 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
73 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
74 import Gargantext.Prelude.Utils hiding (hash)
75 import System.FilePath (FilePath)
76 import qualified Data.List as List
77 import qualified Data.Map as Map
78 import qualified Data.Text as Text
79 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
80 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
81
82 type FlowCmdM env err m =
83 ( CmdM env err m
84 , RepoCmdM env err m
85 , HasNodeError err
86 , HasRepoVar env
87 )
88
89 ------------------------------------------------------------------------
90
91 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
92 -- | APIs
93 -- TODO instances
94 getDataApi :: Lang
95 -> Maybe Limit
96 -> ApiQuery
97 -> IO [HyperdataDocument]
98 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
99 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
100
101
102 flowCorpusApi :: ( FlowCmdM env err m)
103 => Username -> Either CorpusName [CorpusId]
104 -> TermType Lang
105 -> Maybe Limit
106 -> ApiQuery
107 -> m CorpusId
108 flowCorpusApi u n tt l q = do
109 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
110 flowCorpus u n tt docs
111
112 ------------------------------------------------------------------------
113
114 flowAnnuaire :: FlowCmdM env err m
115 => Username -> Either CorpusName [CorpusId] -> (TermType Lang) -> FilePath -> m AnnuaireId
116 flowAnnuaire u n l filePath = do
117 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
118 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
119
120
121 flowCorpusDebat :: FlowCmdM env err m
122 => Username -> Either CorpusName [CorpusId]
123 -> Limit -> FilePath
124 -> m CorpusId
125 flowCorpusDebat u n l fp = do
126 docs <- liftIO ( splitEvery 500
127 <$> take l
128 <$> readFile' fp
129 :: IO [[GD.GrandDebatReference ]]
130 )
131 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
132
133 flowCorpusFile :: FlowCmdM env err m
134 => Username -> Either CorpusName [CorpusId]
135 -> Limit -- Limit the number of docs (for dev purpose)
136 -> TermType Lang -> FileFormat -> FilePath
137 -> m CorpusId
138 flowCorpusFile u n l la ff fp = do
139 docs <- liftIO ( splitEvery 500
140 <$> take l
141 <$> parseFile ff fp
142 )
143 flowCorpus u n la (map (map toHyperdataDocument) docs)
144
145 -- TODO query with complex query
146 flowCorpusSearchInDatabase :: FlowCmdM env err m
147 => Username -> Lang -> Text -> m CorpusId
148 flowCorpusSearchInDatabase u la q = do
149 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
150 ids <- map fst <$> searchInDatabase cId (stemIt q)
151 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
152
153
154 flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
155 => Username -> Lang -> Text -> m CorpusId
156 flowCorpusSearchInDatabaseApi u la q = do
157 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
158 ids <- map fst <$> searchInDatabase cId (stemIt q)
159 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
160
161 ------------------------------------------------------------------------
162 -- | TODO improve the needed type to create/update a corpus
163 data UserInfo = Username Text
164 | UserId NodeId
165 data CorpusInfo = CorpusName Lang Text
166 | CorpusId Lang NodeId
167
168
169 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
170 => Maybe c -> Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
171 flow c u cn la docs = do
172 ids <- mapM (insertMasterDocs c la ) docs
173 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
174
175 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 => Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
177 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
178
179 ------------------------------------------------------------------------
180
181
182 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
183 => Lang -> Username -> Either CorpusName [CorpusId] -> Maybe c -> [NodeId] -> m CorpusId
184 flowCorpusUser l userName corpusName ctype ids = do
185 -- User Flow
186 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
187 -- TODO: check if present already, ignore
188 _ <- Doc.add userCorpusId ids
189 tId <- mkNode NodeTexts userCorpusId userId
190
191 printDebug "Node Text Id" tId
192
193 -- User List Flow
194 --{-
195 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
196 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
197 userListId <- flowList userId userCorpusId ngs
198 printDebug "userListId" userListId
199 -- User Graph Flow
200 _ <- mkDashboard userCorpusId userId
201 _ <- mkGraph userCorpusId userId
202 _ <- mkPhylo userCorpusId userId
203 --}
204
205
206 -- Annuaire Flow
207 -- _ <- mkAnnuaire rootUserId userId
208 pure userCorpusId
209
210
211 insertMasterDocs :: ( FlowCmdM env err m
212 , FlowCorpus a
213 , MkCorpus c
214 )
215 => Maybe c -> TermType Lang -> [a] -> m [DocId]
216 insertMasterDocs c lang hs = do
217 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
218
219 -- TODO Type NodeDocumentUnicised
220 let hs' = map addUniqId hs
221 ids <- insertDb masterUserId masterCorpusId hs'
222 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
223
224 let
225 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
226 where
227 m' = case m of
228 Nothing -> trace ("buildTries here" :: String)
229 $ Just
230 $ buildTries n ( fmap toToken $ uniText
231 $ Text.intercalate " . "
232 $ List.concat
233 $ map hasText documentsWithId
234 )
235 just_m -> just_m
236 fixLang l = l
237
238 lang' = fixLang lang
239 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
240 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
241 terms2id <- insertNgrams $ Map.keys maps
242 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
243
244 lId <- getOrMkList masterCorpusId masterUserId
245 _ <- insertDocNgrams lId indexedNgrams
246 pure $ map reId ids
247
248
249
250 type CorpusName = Text
251
252 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
253 => Username -> Either CorpusName [CorpusId] -> Maybe a
254 -> Cmd err (UserId, RootId, CorpusId)
255 getOrMkRootWithCorpus username cName c = do
256 maybeUserId <- getUser username
257 userId <- case maybeUserId of
258 Nothing -> nodeError NoUserFound
259 Just user -> pure $ userLight_id user
260
261 rootId' <- map _node_id <$> getRoot username
262
263 rootId'' <- case rootId' of
264 [] -> mkRoot username userId
265 n -> case length n >= 2 of
266 True -> nodeError ManyNodeUsers
267 False -> pure rootId'
268
269 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
270
271 corpusId'' <- if username == userMaster
272 then do
273 ns <- getCorporaWithParentId rootId
274 pure $ map _node_id ns
275 else
276 pure $ fromRight [] cName
277
278 corpusId' <- if corpusId'' /= []
279 then pure corpusId''
280 else mk (Just $ fromLeft "Default" cName) c rootId userId
281
282 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
283
284 pure (userId, rootId, corpusId)
285
286
287 ------------------------------------------------------------------------
288
289
290 viewUniqId' :: UniqId a => a -> (HashId, a)
291 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
292 where
293 err = panic "[ERROR] Database.Flow.toInsert"
294
295
296 toInserted :: [ReturnId] -> Map HashId ReturnId
297 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
298 . filter (\r -> reInserted r == True)
299
300 data DocumentWithId a = DocumentWithId
301 { documentId :: !NodeId
302 , documentData :: !a
303 } deriving (Show)
304
305 instance HasText a => HasText (DocumentWithId a)
306 where
307 hasText (DocumentWithId _ a) = hasText a
308
309 mergeData :: Map HashId ReturnId
310 -> Map HashId a
311 -> [DocumentWithId a]
312 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
313 where
314 toDocumentWithId (hash,hpd) =
315 DocumentWithId <$> fmap reId (lookup hash rs)
316 <*> Just hpd
317
318 ------------------------------------------------------------------------
319 data DocumentIdWithNgrams a = DocumentIdWithNgrams
320 { documentWithId :: !(DocumentWithId a)
321 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
322 } deriving (Show)
323
324
325 instance HasText HyperdataContact
326 where
327 hasText = undefined
328
329 instance ExtractNgramsT HyperdataContact
330 where
331 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
332 where
333 extract :: TermType Lang -> HyperdataContact
334 -> Cmd err (Map Ngrams (Map NgramsType Int))
335 extract _l hc' = do
336 let authors = map text2ngrams
337 $ maybe ["Nothing"] (\a -> [a])
338 $ view (hc_who . _Just . cw_lastName) hc'
339
340 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
341
342 instance HasText HyperdataDocument
343 where
344 hasText h = catMaybes [ _hyperdataDocument_title h
345 , _hyperdataDocument_abstract h
346 ]
347
348 instance ExtractNgramsT HyperdataDocument
349 where
350 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
351 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
352 where
353 extractNgramsT' :: TermType Lang -> HyperdataDocument
354 -> Cmd err (Map Ngrams (Map NgramsType Int))
355 extractNgramsT' lang' doc = do
356 let source = text2ngrams
357 $ maybe "Nothing" identity
358 $ _hyperdataDocument_source doc
359
360 institutes = map text2ngrams
361 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
362 $ _hyperdataDocument_institutes doc
363
364 authors = map text2ngrams
365 $ maybe ["Nothing"] (splitOn ", ")
366 $ _hyperdataDocument_authors doc
367
368 terms' <- map text2ngrams
369 <$> map (intercalate " " . _terms_label)
370 <$> concat
371 <$> liftIO (extractTerms lang' $ hasText doc)
372
373 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
374 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
375 <> [(a', Map.singleton Authors 1) | a' <- authors ]
376 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
377
378
379 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
380 -> Map Ngrams (Map NgramsType Int)
381 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
382 where
383 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
384 True -> (ng,y)
385 False -> (Ngrams (Text.take s' t) n , y)
386
387
388 documentIdWithNgrams :: HasNodeError err
389 => (a
390 -> Cmd err (Map Ngrams (Map NgramsType Int)))
391 -> [DocumentWithId a]
392 -> Cmd err [DocumentIdWithNgrams a]
393 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
394 where
395 toDocumentIdWithNgrams d = do
396 e <- f $ documentData d
397 pure $ DocumentIdWithNgrams d e
398
399
400 -- FLOW LIST
401 -- | TODO check optimization
402 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
403 -> Map Ngrams (Map NgramsType (Map NodeId Int))
404 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
405 where
406 f :: DocumentIdWithNgrams a
407 -> Map Ngrams (Map NgramsType (Map NodeId Int))
408 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
409 where
410 nId = documentId $ documentWithId d
411
412 ------------------------------------------------------------------------
413 listInsert :: FlowCmdM env err m
414 => ListId -> Map NgramsType [NgramsElement]
415 -> m ()
416 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
417 -> putListNgrams lId typeList ngElmts
418 ) $ toList ngs
419
420 flowList :: FlowCmdM env err m => UserId -> CorpusId
421 -> Map NgramsType [NgramsElement]
422 -> m ListId
423 flowList uId cId ngs = do
424 lId <- getOrMkList cId uId
425 printDebug "listId flowList" lId
426 listInsert lId ngs
427 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
428 pure lId
429