]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[MERGE][GRAPH FIX] edges weight + confluence in graph.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE NoImplicitPrelude #-}
24 {-# LANGUAGE OverloadedStrings #-}
25 {-# LANGUAGE RankNTypes #-}
26 {-# LANGUAGE FlexibleContexts #-}
27
28 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
29 where
30
31 --import Debug.Trace (trace)
32 import Control.Lens ((^.), view, Lens', _Just)
33 import Control.Monad (mapM_)
34 import Control.Monad.IO.Class (liftIO)
35 import Data.List (concat)
36 import Data.Map (Map, lookup, toList)
37 import Data.Maybe (Maybe(..), catMaybes)
38 import Data.Monoid
39 import Data.Text (Text, splitOn, intercalate)
40 import GHC.Show (Show)
41 import Gargantext.API.Ngrams (HasRepoVar)
42 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Types (NodePoly(..), Terms(..))
45 import Gargantext.Core.Types.Individu (Username)
46 import Gargantext.Core.Types.Main
47 import Gargantext.Database.Config (userMaster, corpusMasterName)
48 import Gargantext.Database.Flow.Utils (insertDocNgrams)
49 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
50 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
51 import Gargantext.Database.Root (getRoot)
52 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
53 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
54 import Gargantext.Database.Schema.User (getUser, UserLight(..))
55 import Gargantext.Database.TextSearch (searchInDatabase)
56 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
57 import Gargantext.Database.Utils (Cmd, CmdM)
58 import Gargantext.Ext.IMT (toSchoolName)
59 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
60 import Gargantext.Prelude
61 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
62 import Gargantext.Text.Parsers (parseFile, FileFormat)
63 import Gargantext.Text.Terms (TermType(..), tt_lang)
64 import Gargantext.Text.Terms (extractTerms)
65 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
66 import Servant (ServantErr)
67 import System.FilePath (FilePath)
68 --import qualified Data.List as List
69 import qualified Data.Map as Map
70 import qualified Data.Text as Text
71 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
72 import qualified Gargantext.Text.Parsers.GrandDebat as GD
73
74 type FlowCmdM env err m =
75 ( CmdM env err m
76 , RepoCmdM env err m
77 , HasNodeError err
78 , HasRepoVar env
79 )
80
81 type FlowCorpus a = ( AddUniqId a
82 , UniqId a
83 , InsertDb a
84 , ExtractNgramsT a
85 )
86
87 ------------------------------------------------------------------------
88
89 flowAnnuaire :: FlowCmdM env ServantErr m
90 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
91 flowAnnuaire u n l filePath = do
92 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
93 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
94
95
96 flowCorpusDebat :: FlowCmdM env ServantErr m
97 => Username -> CorpusName
98 -> Limit -> FilePath
99 -> m CorpusId
100 flowCorpusDebat u n l fp = do
101 docs <- liftIO ( splitEvery 500
102 <$> take l
103 <$> GD.readFile fp
104 :: IO [[GD.GrandDebatReference ]]
105 )
106 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
107
108
109 flowCorpusFile :: FlowCmdM env ServantErr m
110 => Username -> CorpusName
111 -> Limit -- Limit the number of docs (for dev purpose)
112 -> TermType Lang -> FileFormat -> FilePath
113 -> m CorpusId
114 flowCorpusFile u n l la ff fp = do
115 docs <- liftIO ( splitEvery 500
116 <$> take l
117 <$> parseFile ff fp
118 )
119 flowCorpus u n la (map (map toHyperdataDocument) docs)
120
121 -- TODO query with complex query
122 flowCorpusSearchInDatabase :: FlowCmdM env err m
123 => Username -> Lang -> Text -> m CorpusId
124 flowCorpusSearchInDatabase u la q = do
125 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
126 ids <- map fst <$> searchInDatabase cId (stemIt q)
127 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
128
129
130 flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
131 => Username -> Lang -> Text -> m CorpusId
132 flowCorpusSearchInDatabase' u la q = do
133 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
134 ids <- map fst <$> searchInDatabase cId (stemIt q)
135 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
136
137 ------------------------------------------------------------------------
138
139 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
140 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
141 flow c u cn la docs = do
142 ids <- mapM (insertMasterDocs c la ) docs
143 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
144
145 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
146 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
147 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
148
149
150 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
151 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
152 flowCorpusUser l userName corpusName ctype ids = do
153 -- User Flow
154 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
155 -- TODO: check if present already, ignore
156 _ <- Doc.add userCorpusId ids
157
158 -- User List Flow
159 --{-
160 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
161 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
162 userListId <- flowList userId userCorpusId ngs
163 printDebug "userListId" userListId
164 -- User Graph Flow
165 _ <- mkGraph userCorpusId userId
166 --}
167
168 -- User Dashboard Flow
169 _ <- mkDashboard userCorpusId userId
170
171 -- Annuaire Flow
172 -- _ <- mkAnnuaire rootUserId userId
173 pure userCorpusId
174
175
176 insertMasterDocs :: ( FlowCmdM env ServantErr m
177 , FlowCorpus a
178 , MkCorpus c
179 )
180 => Maybe c -> TermType Lang -> [a] -> m [DocId]
181 insertMasterDocs c lang hs = do
182 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
183
184 -- TODO Type NodeDocumentUnicised
185 let hs' = map addUniqId hs
186 ids <- insertDb masterUserId masterCorpusId hs'
187 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
188
189 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
190 terms2id <- insertNgrams $ Map.keys maps
191 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
192
193 lId <- getOrMkList masterCorpusId masterUserId
194 _ <- insertDocNgrams lId indexedNgrams
195 pure $ map reId ids
196
197
198
199 type CorpusName = Text
200
201 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
202 => Username -> CorpusName -> Maybe a
203 -> Cmd err (UserId, RootId, CorpusId)
204 getOrMkRootWithCorpus username cName c = do
205 maybeUserId <- getUser username
206 userId <- case maybeUserId of
207 Nothing -> nodeError NoUserFound
208 Just user -> pure $ userLight_id user
209
210 rootId' <- map _node_id <$> getRoot username
211
212 rootId'' <- case rootId' of
213 [] -> mkRoot username userId
214 n -> case length n >= 2 of
215 True -> nodeError ManyNodeUsers
216 False -> pure rootId'
217
218 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
219
220 corpusId'' <- if username == userMaster
221 then do
222 ns <- getCorporaWithParentId rootId
223 pure $ map _node_id ns
224 else
225 pure []
226
227 corpusId' <- if corpusId'' /= []
228 then pure corpusId''
229 else mk (Just cName) c rootId userId
230
231 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
232
233 pure (userId, rootId, corpusId)
234
235
236 ------------------------------------------------------------------------
237
238
239 class UniqId a
240 where
241 uniqId :: Lens' a (Maybe HashId)
242
243
244 instance UniqId HyperdataDocument
245 where
246 uniqId = hyperdataDocument_uniqId
247
248 instance UniqId HyperdataContact
249 where
250 uniqId = hc_uniqId
251
252 viewUniqId' :: UniqId a => a -> (HashId, a)
253 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
254 where
255 err = panic "[ERROR] Database.Flow.toInsert"
256
257
258 toInserted :: [ReturnId] -> Map HashId ReturnId
259 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
260 . filter (\r -> reInserted r == True)
261
262 data DocumentWithId a = DocumentWithId
263 { documentId :: !NodeId
264 , documentData :: !a
265 } deriving (Show)
266
267 mergeData :: Map HashId ReturnId
268 -> Map HashId a
269 -> [DocumentWithId a]
270 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
271 where
272 toDocumentWithId (hash,hpd) =
273 DocumentWithId <$> fmap reId (lookup hash rs)
274 <*> Just hpd
275
276 ------------------------------------------------------------------------
277 data DocumentIdWithNgrams a = DocumentIdWithNgrams
278 { documentWithId :: !(DocumentWithId a)
279 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
280 } deriving (Show)
281
282 -- TODO extractNgrams according to Type of Data
283
284 class ExtractNgramsT h
285 where
286 extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
287
288
289 instance ExtractNgramsT HyperdataContact
290 where
291 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
292 where
293 extract :: TermType Lang -> HyperdataContact
294 -> Cmd err (Map Ngrams (Map NgramsType Int))
295 extract _l hc' = do
296 let authors = map text2ngrams
297 $ maybe ["Nothing"] (\a -> [a])
298 $ view (hc_who . _Just . cw_lastName) hc'
299
300 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
301
302
303
304
305 instance ExtractNgramsT HyperdataDocument
306 where
307 extractNgramsT = extractNgramsT'
308
309 extractNgramsT' :: TermType Lang -> HyperdataDocument
310 -> Cmd err (Map Ngrams (Map NgramsType Int))
311 extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
312 where
313 extractNgramsT'' :: TermType Lang -> HyperdataDocument
314 -> Cmd err (Map Ngrams (Map NgramsType Int))
315 extractNgramsT'' lang' doc = do
316 let source = text2ngrams
317 $ maybe "Nothing" identity
318 $ _hyperdataDocument_source doc
319
320 institutes = map text2ngrams
321 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
322 $ _hyperdataDocument_institutes doc
323
324 authors = map text2ngrams
325 $ maybe ["Nothing"] (splitOn ", ")
326 $ _hyperdataDocument_authors doc
327
328 leText = catMaybes [ _hyperdataDocument_title doc
329 , _hyperdataDocument_abstract doc
330 ]
331
332 terms' <- map text2ngrams
333 <$> map (intercalate " " . _terms_label)
334 <$> concat
335 <$> liftIO (extractTerms lang' leText)
336
337 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
338 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
339 <> [(a', Map.singleton Authors 1) | a' <- authors ]
340 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
341
342
343 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
344 -> Map Ngrams (Map NgramsType Int)
345 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
346 where
347 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
348 True -> (ng,y)
349 False -> (Ngrams (Text.take s' t) n , y)
350
351
352 documentIdWithNgrams :: HasNodeError err
353 => (a
354 -> Cmd err (Map Ngrams (Map NgramsType Int)))
355 -> [DocumentWithId a]
356 -> Cmd err [DocumentIdWithNgrams a]
357 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
358 where
359 toDocumentIdWithNgrams d = do
360 e <- f $ documentData d
361 pure $ DocumentIdWithNgrams d e
362
363
364
365 -- FLOW LIST
366 -- | TODO check optimization
367 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
368 -> Map Ngrams (Map NgramsType (Map NodeId Int))
369 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
370 where
371 f :: DocumentIdWithNgrams a
372 -> Map Ngrams (Map NgramsType (Map NodeId Int))
373 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
374 where
375 nId = documentId $ documentWithId d
376
377 ------------------------------------------------------------------------
378 listInsert :: FlowCmdM env err m
379 => ListId -> Map NgramsType [NgramsElement]
380 -> m ()
381 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
382 -> putListNgrams lId typeList ngElmts
383 ) $ toList ngs
384
385 flowList :: FlowCmdM env err m => UserId -> CorpusId
386 -> Map NgramsType [NgramsElement]
387 -> m ListId
388 flowList uId cId ngs = do
389 lId <- getOrMkList cId uId
390 printDebug "listId flowList" lId
391 listInsert lId ngs
392 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
393 pure lId
394