]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FIX] BUG limit on Nodes by Ngrams count.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE NoImplicitPrelude #-}
24 {-# LANGUAGE OverloadedStrings #-}
25 {-# LANGUAGE RankNTypes #-}
26 {-# LANGUAGE FlexibleContexts #-}
27
28 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
29 where
30
31 --import Debug.Trace (trace)
32 import Control.Lens ((^.), view, Lens', _Just)
33 import Control.Monad (mapM_)
34 import Control.Monad.IO.Class (liftIO)
35 import Data.List (concat)
36 import Data.Map (Map, lookup, toList)
37 import Data.Maybe (Maybe(..), catMaybes)
38 import Data.Monoid
39 import Data.Text (Text, splitOn, intercalate)
40 import GHC.Show (Show)
41 import Gargantext.API.Ngrams (HasRepoVar)
42 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Types (NodePoly(..), Terms(..))
45 import Gargantext.Core.Types.Individu (Username)
46 import Gargantext.Core.Types.Main
47 import Gargantext.Database.Config (userMaster, corpusMasterName)
48 import Gargantext.Database.Flow.Utils (insertDocNgrams)
49 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
50 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
51 import Gargantext.Database.Root (getRoot)
52 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
53 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
54 import Gargantext.Database.Schema.User (getUser, UserLight(..))
55 import Gargantext.Database.TextSearch (searchInDatabase)
56 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
57 import Gargantext.Database.Utils (Cmd, CmdM)
58 import Gargantext.Ext.IMT (toSchoolName)
59 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
60 import Gargantext.Prelude
61 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
62 import Gargantext.Text.Parsers (parseDocs, FileFormat)
63 import Gargantext.Text.Terms (TermType(..), tt_lang)
64 import Gargantext.Text.Terms (extractTerms)
65 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
66 import Servant (ServantErr)
67 import System.FilePath (FilePath)
68 --import qualified Data.List as List
69 import qualified Data.Map as Map
70 import qualified Data.Text as Text
71 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
72 import qualified Gargantext.Text.Parsers.GrandDebat as GD
73
74 type FlowCmdM env err m =
75 ( CmdM env err m
76 , RepoCmdM env err m
77 , HasNodeError err
78 , HasRepoVar env
79 )
80
81 type FlowCorpus a = ( AddUniqId a
82 , UniqId a
83 , InsertDb a
84 , ExtractNgramsT a
85 )
86
87 ------------------------------------------------------------------------
88
89 flowAnnuaire :: FlowCmdM env ServantErr m
90 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
91 flowAnnuaire u n l filePath = do
92 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
93 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
94
95
96 flowCorpusDebat :: FlowCmdM env ServantErr m
97 => Username -> CorpusName
98 -> Limit -> FilePath
99 -> m CorpusId
100 flowCorpusDebat u n l fp = do
101 docs <- liftIO ( splitEvery 500
102 <$> take l
103 <$> GD.readFile fp
104 :: IO [[GD.GrandDebatReference ]]
105 )
106 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
107
108
109 flowCorpusFile :: FlowCmdM env ServantErr m
110 => Username -> CorpusName
111 -> Limit -- Limit the number of docs (for dev purpose)
112 -> TermType Lang -> FileFormat -> FilePath
113 -> m CorpusId
114 flowCorpusFile u n l la ff fp = do
115 docs <- liftIO ( splitEvery 500
116 <$> take l
117 <$> parseDocs ff fp
118 )
119 flowCorpus u n la (map (map toHyperdataDocument) docs)
120
121 -- TODO query with complex query
122 flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
123 => Username -> Lang -> Text -> m CorpusId
124 flowCorpusSearchInDatabase u la q = do
125 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
126 ids <- map fst <$> searchInDatabase cId (stemIt q)
127 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
128
129 ------------------------------------------------------------------------
130
131 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
132 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
133 flow c u cn la docs = do
134 ids <- mapM (insertMasterDocs c la ) docs
135 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
136
137 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
138 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
139 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
140
141
142 flowCorpusUser :: (FlowCmdM env ServantErr m, MkCorpus c)
143 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
144 flowCorpusUser l userName corpusName ctype ids = do
145 -- User Flow
146 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
147 -- TODO: check if present already, ignore
148 _ <- Doc.add userCorpusId ids
149
150 -- User List Flow
151 --{-
152 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
153 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
154 userListId <- flowList userId userCorpusId ngs
155 printDebug "userListId" userListId
156 -- User Graph Flow
157 _ <- mkGraph userCorpusId userId
158 --}
159
160 -- User Dashboard Flow
161 _ <- mkDashboard userCorpusId userId
162
163 -- Annuaire Flow
164 -- _ <- mkAnnuaire rootUserId userId
165 pure userCorpusId
166
167
168 insertMasterDocs :: ( FlowCmdM env ServantErr m
169 , FlowCorpus a
170 , MkCorpus c
171 )
172 => Maybe c -> TermType Lang -> [a] -> m [DocId]
173 insertMasterDocs c lang hs = do
174 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
175
176 -- TODO Type NodeDocumentUnicised
177 let hs' = map addUniqId hs
178 ids <- insertDb masterUserId masterCorpusId hs'
179 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
180
181 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
182 terms2id <- insertNgrams $ Map.keys maps
183 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
184
185 lId <- getOrMkList masterCorpusId masterUserId
186 _ <- insertDocNgrams lId indexedNgrams
187 pure $ map reId ids
188
189
190
191 type CorpusName = Text
192
193 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
194 => Username -> CorpusName -> Maybe a
195 -> Cmd err (UserId, RootId, CorpusId)
196 getOrMkRootWithCorpus username cName c = do
197 maybeUserId <- getUser username
198 userId <- case maybeUserId of
199 Nothing -> nodeError NoUserFound
200 Just user -> pure $ userLight_id user
201
202 rootId' <- map _node_id <$> getRoot username
203
204 rootId'' <- case rootId' of
205 [] -> mkRoot username userId
206 n -> case length n >= 2 of
207 True -> nodeError ManyNodeUsers
208 False -> pure rootId'
209
210 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
211
212 corpusId'' <- if username == userMaster
213 then do
214 ns <- getCorporaWithParentId rootId
215 pure $ map _node_id ns
216 else
217 pure []
218
219 corpusId' <- if corpusId'' /= []
220 then pure corpusId''
221 else mk (Just cName) c rootId userId
222
223 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
224
225 pure (userId, rootId, corpusId)
226
227
228 ------------------------------------------------------------------------
229
230
231 class UniqId a
232 where
233 uniqId :: Lens' a (Maybe HashId)
234
235
236 instance UniqId HyperdataDocument
237 where
238 uniqId = hyperdataDocument_uniqId
239
240 instance UniqId HyperdataContact
241 where
242 uniqId = hc_uniqId
243
244 viewUniqId' :: UniqId a => a -> (HashId, a)
245 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
246 where
247 err = panic "[ERROR] Database.Flow.toInsert"
248
249
250 toInserted :: [ReturnId] -> Map HashId ReturnId
251 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
252 . filter (\r -> reInserted r == True)
253
254 data DocumentWithId a = DocumentWithId
255 { documentId :: !NodeId
256 , documentData :: !a
257 } deriving (Show)
258
259 mergeData :: Map HashId ReturnId
260 -> Map HashId a
261 -> [DocumentWithId a]
262 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
263 where
264 toDocumentWithId (hash,hpd) =
265 DocumentWithId <$> fmap reId (lookup hash rs)
266 <*> Just hpd
267
268 ------------------------------------------------------------------------
269 data DocumentIdWithNgrams a = DocumentIdWithNgrams
270 { documentWithId :: !(DocumentWithId a)
271 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
272 } deriving (Show)
273
274 -- TODO extractNgrams according to Type of Data
275
276 class ExtractNgramsT h
277 where
278 extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
279
280
281 instance ExtractNgramsT HyperdataContact
282 where
283 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
284 where
285 extract :: TermType Lang -> HyperdataContact
286 -> Cmd err (Map Ngrams (Map NgramsType Int))
287 extract _l hc' = do
288 let authors = map text2ngrams
289 $ maybe ["Nothing"] (\a -> [a])
290 $ view (hc_who . _Just . cw_lastName) hc'
291
292 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
293
294
295
296
297 instance ExtractNgramsT HyperdataDocument
298 where
299 extractNgramsT = extractNgramsT'
300
301 extractNgramsT' :: TermType Lang -> HyperdataDocument
302 -> Cmd err (Map Ngrams (Map NgramsType Int))
303 extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
304 where
305 extractNgramsT'' :: TermType Lang -> HyperdataDocument
306 -> Cmd err (Map Ngrams (Map NgramsType Int))
307 extractNgramsT'' lang' doc = do
308 let source = text2ngrams
309 $ maybe "Nothing" identity
310 $ _hyperdataDocument_source doc
311
312 institutes = map text2ngrams
313 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
314 $ _hyperdataDocument_institutes doc
315
316 authors = map text2ngrams
317 $ maybe ["Nothing"] (splitOn ", ")
318 $ _hyperdataDocument_authors doc
319
320 leText = catMaybes [ _hyperdataDocument_title doc
321 , _hyperdataDocument_abstract doc
322 ]
323
324 terms' <- map text2ngrams
325 <$> map (intercalate " " . _terms_label)
326 <$> concat
327 <$> liftIO (extractTerms lang' leText)
328
329 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
330 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
331 <> [(a', Map.singleton Authors 1) | a' <- authors ]
332 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
333
334
335 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
336 -> Map Ngrams (Map NgramsType Int)
337 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
338 where
339 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
340 True -> (ng,y)
341 False -> (Ngrams (Text.take s' t) n , y)
342
343
344 documentIdWithNgrams :: HasNodeError err
345 => (a
346 -> Cmd err (Map Ngrams (Map NgramsType Int)))
347 -> [DocumentWithId a]
348 -> Cmd err [DocumentIdWithNgrams a]
349 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
350 where
351 toDocumentIdWithNgrams d = do
352 e <- f $ documentData d
353 pure $ DocumentIdWithNgrams d e
354
355
356
357 -- FLOW LIST
358 -- | TODO check optimization
359 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
360 -> Map Ngrams (Map NgramsType (Map NodeId Int))
361 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
362 where
363 f :: DocumentIdWithNgrams a
364 -> Map Ngrams (Map NgramsType (Map NodeId Int))
365 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
366 where
367 nId = documentId $ documentWithId d
368
369 ------------------------------------------------------------------------
370 listInsert :: FlowCmdM env err m
371 => ListId -> Map NgramsType [NgramsElement]
372 -> m ()
373 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
374 -> putListNgrams lId typeList ngElmts
375 ) $ toList ngs
376
377 flowList :: FlowCmdM env err m => UserId -> CorpusId
378 -> Map NgramsType [NgramsElement]
379 -> m ListId
380 flowList uId cId ngs = do
381 lId <- getOrMkList cId uId
382 printDebug "listId flowList" lId
383 listInsert lId ngs
384 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
385 pure lId
386