]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[PHYLO] merge
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
22
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
32
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 where
35 import Prelude (String)
36 import Debug.Trace (trace)
37 import Control.Lens ((^.), view, _Just)
38 import Control.Monad (mapM_)
39 import Control.Monad.IO.Class (liftIO)
40 import Data.List (concat)
41 import Data.Map (Map, lookup, toList)
42 import Data.Maybe (Maybe(..), catMaybes)
43 import Data.Monoid
44 import Data.Text (Text, splitOn, intercalate)
45 import GHC.Show (Show)
46 import Gargantext.API.Ngrams (HasRepoVar)
47 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
48 import Gargantext.Core (Lang(..))
49 import Gargantext.Core.Types (NodePoly(..), Terms(..))
50 import Gargantext.Core.Types.Individu (Username)
51 import Gargantext.Core.Flow
52 import Gargantext.Core.Types.Main
53 import Gargantext.Database.Config (userMaster, corpusMasterName)
54 import Gargantext.Database.Flow.Utils (insertDocNgrams)
55 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
56 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
57 import Gargantext.Database.Root (getRoot)
58 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
59 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
60 import Gargantext.Database.Schema.User (getUser, UserLight(..))
61 import Gargantext.Database.TextSearch (searchInDatabase)
62 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
63 import Gargantext.Database.Utils (Cmd, CmdM)
64 import Gargantext.Ext.IMT (toSchoolName)
65 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
66 import Gargantext.Prelude
67 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
68 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
69 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
70 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
71 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
72 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
73 import Servant (ServantErr)
74 import System.FilePath (FilePath)
75 import qualified Data.List as List
76 import qualified Data.Map as Map
77 import qualified Data.Text as Text
78 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
79 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
80
81 type FlowCmdM env err m =
82 ( CmdM env err m
83 , RepoCmdM env err m
84 , HasNodeError err
85 , HasRepoVar env
86 )
87
88 ------------------------------------------------------------------------
89
90 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
91 -- | APIs
92 -- TODO instances
93 getDataApi :: Lang
94 -> Maybe Limit
95 -> ApiQuery
96 -> IO [HyperdataDocument]
97 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
98 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
99
100
101 flowCorpusApi :: ( FlowCmdM env ServantErr m)
102 => Username -> CorpusName
103 -> TermType Lang
104 -> Maybe Limit
105 -> ApiQuery
106 -> m CorpusId
107 flowCorpusApi u n tt l q = do
108 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
109 flowCorpus u n tt docs
110
111 ------------------------------------------------------------------------
112
113 flowAnnuaire :: FlowCmdM env ServantErr m
114 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
115 flowAnnuaire u n l filePath = do
116 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
117 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
118
119
120 flowCorpusDebat :: FlowCmdM env ServantErr m
121 => Username -> CorpusName
122 -> Limit -> FilePath
123 -> m CorpusId
124 flowCorpusDebat u n l fp = do
125 docs <- liftIO ( splitEvery 500
126 <$> take l
127 <$> GD.readFile fp
128 :: IO [[GD.GrandDebatReference ]]
129 )
130 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
131
132 flowCorpusFile :: FlowCmdM env ServantErr m
133 => Username -> CorpusName
134 -> Limit -- Limit the number of docs (for dev purpose)
135 -> TermType Lang -> FileFormat -> FilePath
136 -> m CorpusId
137 flowCorpusFile u n l la ff fp = do
138 docs <- liftIO ( splitEvery 500
139 <$> take l
140 <$> parseFile ff fp
141 )
142 flowCorpus u n la (map (map toHyperdataDocument) docs)
143
144 -- TODO query with complex query
145 flowCorpusSearchInDatabase :: FlowCmdM env err m
146 => Username -> Lang -> Text -> m CorpusId
147 flowCorpusSearchInDatabase u la q = do
148 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
149 ids <- map fst <$> searchInDatabase cId (stemIt q)
150 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
151
152
153 flowCorpusSearchInDatabaseApi :: FlowCmdM env ServantErr m
154 => Username -> Lang -> Text -> m CorpusId
155 flowCorpusSearchInDatabaseApi u la q = do
156 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
157 ids <- map fst <$> searchInDatabase cId (stemIt q)
158 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
159
160 ------------------------------------------------------------------------
161 -- | TODO improve the needed type to create/update a corpus
162 data UserInfo = Username Text
163 | UserId NodeId
164 data CorpusInfo = CorpusName Lang Text
165 | CorpusId Lang NodeId
166
167
168 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
169 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
170 flow c u cn la docs = do
171 ids <- mapM (insertMasterDocs c la ) docs
172 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
173
174 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
175 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
176 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
177
178 ------------------------------------------------------------------------
179
180
181 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
182 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
183 flowCorpusUser l userName corpusName ctype ids = do
184 -- User Flow
185 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
186 -- TODO: check if present already, ignore
187 _ <- Doc.add userCorpusId ids
188
189 -- User List Flow
190 --{-
191 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
192 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
193 userListId <- flowList userId userCorpusId ngs
194 printDebug "userListId" userListId
195 -- User Graph Flow
196 _ <- mkGraph userCorpusId userId
197 _ <- mkPhylo userCorpusId userId
198 --}
199
200 -- User Dashboard Flow
201 _ <- mkDashboard userCorpusId userId
202
203 -- Annuaire Flow
204 -- _ <- mkAnnuaire rootUserId userId
205 pure userCorpusId
206
207
208 insertMasterDocs :: ( FlowCmdM env err m
209 , FlowCorpus a
210 , MkCorpus c
211 )
212 => Maybe c -> TermType Lang -> [a] -> m [DocId]
213 insertMasterDocs c lang hs = do
214 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
215
216 -- TODO Type NodeDocumentUnicised
217 let hs' = map addUniqId hs
218 ids <- insertDb masterUserId masterCorpusId hs'
219 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
220
221 let
222 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
223 where
224 m' = case m of
225 Nothing -> trace ("buildTries here" :: String)
226 $ Just
227 $ buildTries n ( fmap toToken $ uniText
228 $ Text.intercalate " . "
229 $ List.concat
230 $ map hasText documentsWithId
231 )
232 just_m -> just_m
233 fixLang l = l
234
235 lang' = fixLang lang
236 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
237 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
238 terms2id <- insertNgrams $ Map.keys maps
239 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
240
241 lId <- getOrMkList masterCorpusId masterUserId
242 _ <- insertDocNgrams lId indexedNgrams
243 pure $ map reId ids
244
245
246
247 type CorpusName = Text
248
249 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
250 => Username -> CorpusName -> Maybe a
251 -> Cmd err (UserId, RootId, CorpusId)
252 getOrMkRootWithCorpus username cName c = do
253 maybeUserId <- getUser username
254 userId <- case maybeUserId of
255 Nothing -> nodeError NoUserFound
256 Just user -> pure $ userLight_id user
257
258 rootId' <- map _node_id <$> getRoot username
259
260 rootId'' <- case rootId' of
261 [] -> mkRoot username userId
262 n -> case length n >= 2 of
263 True -> nodeError ManyNodeUsers
264 False -> pure rootId'
265
266 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
267
268 corpusId'' <- if username == userMaster
269 then do
270 ns <- getCorporaWithParentId rootId
271 pure $ map _node_id ns
272 else
273 pure []
274
275 corpusId' <- if corpusId'' /= []
276 then pure corpusId''
277 else mk (Just cName) c rootId userId
278
279 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
280
281 pure (userId, rootId, corpusId)
282
283
284 ------------------------------------------------------------------------
285
286
287 viewUniqId' :: UniqId a => a -> (HashId, a)
288 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
289 where
290 err = panic "[ERROR] Database.Flow.toInsert"
291
292
293 toInserted :: [ReturnId] -> Map HashId ReturnId
294 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
295 . filter (\r -> reInserted r == True)
296
297 data DocumentWithId a = DocumentWithId
298 { documentId :: !NodeId
299 , documentData :: !a
300 } deriving (Show)
301
302 instance HasText a => HasText (DocumentWithId a)
303 where
304 hasText (DocumentWithId _ a) = hasText a
305
306 mergeData :: Map HashId ReturnId
307 -> Map HashId a
308 -> [DocumentWithId a]
309 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
310 where
311 toDocumentWithId (hash,hpd) =
312 DocumentWithId <$> fmap reId (lookup hash rs)
313 <*> Just hpd
314
315 ------------------------------------------------------------------------
316 data DocumentIdWithNgrams a = DocumentIdWithNgrams
317 { documentWithId :: !(DocumentWithId a)
318 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
319 } deriving (Show)
320
321
322 instance HasText HyperdataContact
323 where
324 hasText = undefined
325
326 instance ExtractNgramsT HyperdataContact
327 where
328 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
329 where
330 extract :: TermType Lang -> HyperdataContact
331 -> Cmd err (Map Ngrams (Map NgramsType Int))
332 extract _l hc' = do
333 let authors = map text2ngrams
334 $ maybe ["Nothing"] (\a -> [a])
335 $ view (hc_who . _Just . cw_lastName) hc'
336
337 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
338
339 instance HasText HyperdataDocument
340 where
341 hasText h = catMaybes [ _hyperdataDocument_title h
342 , _hyperdataDocument_abstract h
343 ]
344
345 instance ExtractNgramsT HyperdataDocument
346 where
347 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
348 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
349 where
350 extractNgramsT' :: TermType Lang -> HyperdataDocument
351 -> Cmd err (Map Ngrams (Map NgramsType Int))
352 extractNgramsT' lang' doc = do
353 let source = text2ngrams
354 $ maybe "Nothing" identity
355 $ _hyperdataDocument_source doc
356
357 institutes = map text2ngrams
358 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
359 $ _hyperdataDocument_institutes doc
360
361 authors = map text2ngrams
362 $ maybe ["Nothing"] (splitOn ", ")
363 $ _hyperdataDocument_authors doc
364
365 terms' <- map text2ngrams
366 <$> map (intercalate " " . _terms_label)
367 <$> concat
368 <$> liftIO (extractTerms lang' $ hasText doc)
369
370 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
371 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
372 <> [(a', Map.singleton Authors 1) | a' <- authors ]
373 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
374
375
376 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
377 -> Map Ngrams (Map NgramsType Int)
378 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
379 where
380 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
381 True -> (ng,y)
382 False -> (Ngrams (Text.take s' t) n , y)
383
384
385 documentIdWithNgrams :: HasNodeError err
386 => (a
387 -> Cmd err (Map Ngrams (Map NgramsType Int)))
388 -> [DocumentWithId a]
389 -> Cmd err [DocumentIdWithNgrams a]
390 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
391 where
392 toDocumentIdWithNgrams d = do
393 e <- f $ documentData d
394 pure $ DocumentIdWithNgrams d e
395
396
397 -- FLOW LIST
398 -- | TODO check optimization
399 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
400 -> Map Ngrams (Map NgramsType (Map NodeId Int))
401 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
402 where
403 f :: DocumentIdWithNgrams a
404 -> Map Ngrams (Map NgramsType (Map NodeId Int))
405 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
406 where
407 nId = documentId $ documentWithId d
408
409 ------------------------------------------------------------------------
410 listInsert :: FlowCmdM env err m
411 => ListId -> Map NgramsType [NgramsElement]
412 -> m ()
413 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
414 -> putListNgrams lId typeList ngElmts
415 ) $ toList ngs
416
417 flowList :: FlowCmdM env err m => UserId -> CorpusId
418 -> Map NgramsType [NgramsElement]
419 -> m ListId
420 flowList uId cId ngs = do
421 lId <- getOrMkList cId uId
422 printDebug "listId flowList" lId
423 listInsert lId ngs
424 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
425 pure lId
426