]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[NodeNodeNgram] First Node is now NodeListId
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE NoImplicitPrelude #-}
24 {-# LANGUAGE OverloadedStrings #-}
25 {-# LANGUAGE RankNTypes #-}
26 {-# LANGUAGE FlexibleContexts #-}
27
28 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
29 where
30
31 import Control.Lens ((^.), view, Lens', _Just)
32 import Control.Monad (mapM_)
33 import Control.Monad.IO.Class (liftIO)
34 import Data.List (concat)
35 import Data.Map (Map, lookup, toList)
36 import Data.Maybe (Maybe(..), catMaybes)
37 import Data.Monoid
38 import Data.Text (Text, splitOn, intercalate)
39 import GHC.Show (Show)
40 import Gargantext.API.Ngrams (HasRepoVar)
41 import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
42 import Gargantext.Core (Lang(..))
43 import Gargantext.Core.Types (NodePoly(..), Terms(..))
44 import Gargantext.Core.Types.Individu (Username)
45 import Gargantext.Core.Types.Main
46 import Gargantext.Database.Config (userMaster, corpusMasterName)
47 import Gargantext.Database.Flow.Utils (insertDocNgrams)
48 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
49 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
50 import Gargantext.Database.Root (getRoot)
51 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
52 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
53 import Gargantext.Database.Schema.User (getUser, UserLight(..))
54 import Gargantext.Database.TextSearch (searchInDatabase)
55 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
56 import Gargantext.Database.Utils (Cmd, CmdM)
57 import Gargantext.Ext.IMT (toSchoolName)
58 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
59 import Gargantext.Prelude
60 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
61 import Gargantext.Text.Parsers (parseDocs, FileFormat)
62 import Gargantext.Text.Terms (TermType(..), tt_lang)
63 import Gargantext.Text.Terms (extractTerms)
64 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
65 import Servant (ServantErr)
66 import System.FilePath (FilePath)
67 import qualified Data.Map as DM
68 import qualified Data.Text as Text
69 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
70 import qualified Gargantext.Text.Parsers.GrandDebat as GD
71
72 type FlowCmdM env err m =
73 ( CmdM env err m
74 , RepoCmdM env err m
75 , HasNodeError err
76 , HasRepoVar env
77 )
78
79 type FlowCorpus a = ( AddUniqId a
80 , UniqId a
81 , InsertDb a
82 , ExtractNgramsT a
83 )
84
85 ------------------------------------------------------------------------
86
87 flowAnnuaire :: FlowCmdM env ServantErr m
88 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
89 flowAnnuaire u n l filePath = do
90 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
91 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
92
93
94 flowCorpusDebat :: FlowCmdM env ServantErr m
95 => Username -> CorpusName
96 -> Limit -> FilePath
97 -> m CorpusId
98 flowCorpusDebat u n l fp = do
99 docs <- liftIO ( splitEvery 500
100 <$> take l
101 <$> GD.readFile fp
102 :: IO [[GD.GrandDebatReference ]]
103 )
104 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
105
106
107 flowCorpusFile :: FlowCmdM env ServantErr m
108 => Username -> CorpusName
109 -> Limit -- ^ Limit the number of docs (for dev purpose)
110 -> TermType Lang -> FileFormat -> FilePath
111 -> m CorpusId
112 flowCorpusFile u n l la ff fp = do
113 docs <- liftIO ( splitEvery 500
114 <$> take l
115 <$> parseDocs ff fp
116 )
117 flowCorpus u n la (map (map toHyperdataDocument) docs)
118
119 -- TODO query with complex query
120 flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
121 => Username -> Lang -> Text -> m CorpusId
122 flowCorpusSearchInDatabase u la q = do
123 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
124 ids <- map fst <$> searchInDatabase cId (stemIt q)
125 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
126
127 ------------------------------------------------------------------------
128
129
130 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
131 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
132 flow c u cn la docs = do
133 ids <- mapM (insertMasterDocs c la ) docs
134 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
135
136 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
137 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
138 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
139
140
141 flowCorpusUser :: (FlowCmdM env ServantErr m, MkCorpus c)
142 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
143 flowCorpusUser l userName corpusName ctype ids = do
144 -- User Flow
145 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
146 -- TODO: check if present already, ignore
147 _ <- Doc.add userCorpusId ids
148
149 -- User List Flow
150 --{-
151 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
152 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
153 userListId <- flowList userId userCorpusId ngs
154 printDebug "userListId" userListId
155 -- User Graph Flow
156 _ <- mkGraph userCorpusId userId
157 --}
158
159 -- User Dashboard Flow
160 _ <- mkDashboard userCorpusId userId
161
162 -- Annuaire Flow
163 -- _ <- mkAnnuaire rootUserId userId
164 pure userCorpusId
165
166
167 insertMasterDocs :: ( FlowCmdM env ServantErr m
168 , FlowCorpus a
169 , MkCorpus c
170 )
171 => Maybe c -> TermType Lang -> [a] -> m [DocId]
172 insertMasterDocs c lang hs = do
173 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
174
175 -- TODO Type NodeDocumentUnicised
176 let hs' = map addUniqId hs
177 ids <- insertDb masterUserId masterCorpusId hs'
178 let documentsWithId = mergeData (toInserted ids) (DM.fromList $ map viewUniqId' hs')
179
180 docsWithNgrams <- documentIdWithNgrams (extractNgramsT lang) documentsWithId
181
182 let maps = mapNodeIdNgrams docsWithNgrams
183
184 terms2id <- insertNgrams $ DM.keys maps
185 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
186
187 lId <- getOrMkList masterCorpusId masterUserId
188 _ <- insertDocNgrams lId indexedNgrams
189 pure $ map reId ids
190
191
192
193 type CorpusName = Text
194
195 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
196 => Username -> CorpusName -> Maybe a
197 -> Cmd err (UserId, RootId, CorpusId)
198 getOrMkRootWithCorpus username cName c = do
199 maybeUserId <- getUser username
200 userId <- case maybeUserId of
201 Nothing -> nodeError NoUserFound
202 Just user -> pure $ userLight_id user
203
204 rootId' <- map _node_id <$> getRoot username
205
206 rootId'' <- case rootId' of
207 [] -> mkRoot username userId
208 n -> case length n >= 2 of
209 True -> nodeError ManyNodeUsers
210 False -> pure rootId'
211
212 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
213
214 corpusId'' <- if username == userMaster
215 then do
216 ns <- getCorporaWithParentId rootId
217 pure $ map _node_id ns
218 else
219 pure []
220
221 corpusId' <- if corpusId'' /= []
222 then pure corpusId''
223 else mk (Just cName) c rootId userId
224
225 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
226
227 pure (userId, rootId, corpusId)
228
229
230 ------------------------------------------------------------------------
231
232
233 class UniqId a
234 where
235 uniqId :: Lens' a (Maybe HashId)
236
237
238 instance UniqId HyperdataDocument
239 where
240 uniqId = hyperdataDocument_uniqId
241
242 instance UniqId HyperdataContact
243 where
244 uniqId = hc_uniqId
245
246 viewUniqId' :: UniqId a => a -> (HashId, a)
247 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
248 where
249 err = panic "[ERROR] Database.Flow.toInsert"
250
251
252 toInserted :: [ReturnId] -> Map HashId ReturnId
253 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
254 . filter (\r -> reInserted r == True)
255
256 data DocumentWithId a = DocumentWithId
257 { documentId :: !NodeId
258 , documentData :: !a
259 } deriving (Show)
260
261 mergeData :: Map HashId ReturnId
262 -> Map HashId a
263 -> [DocumentWithId a]
264 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
265 where
266 toDocumentWithId (hash,hpd) =
267 DocumentWithId <$> fmap reId (lookup hash rs)
268 <*> Just hpd
269
270 ------------------------------------------------------------------------
271 data DocumentIdWithNgrams a = DocumentIdWithNgrams
272 { documentWithId :: !(DocumentWithId a)
273 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
274 } deriving (Show)
275
276 -- TODO extractNgrams according to Type of Data
277
278 class ExtractNgramsT h
279 where
280 extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
281
282
283 instance ExtractNgramsT HyperdataContact
284 where
285 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
286 where
287 extract :: TermType Lang -> HyperdataContact
288 -> Cmd err (Map Ngrams (Map NgramsType Int))
289 extract _l hc' = do
290 let authors = map text2ngrams
291 $ maybe ["Nothing"] (\a -> [a])
292 $ view (hc_who . _Just . cw_lastName) hc'
293
294 pure $ DM.fromList $ [(a', DM.singleton Authors 1) | a' <- authors ]
295
296
297
298
299 instance ExtractNgramsT HyperdataDocument
300 where
301 extractNgramsT = extractNgramsT'
302
303 extractNgramsT' :: TermType Lang -> HyperdataDocument
304 -> Cmd err (Map Ngrams (Map NgramsType Int))
305 extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
306 where
307 extractNgramsT'' :: TermType Lang -> HyperdataDocument
308 -> Cmd err (Map Ngrams (Map NgramsType Int))
309 extractNgramsT'' lang' doc = do
310 let source = text2ngrams
311 $ maybe "Nothing" identity
312 $ _hyperdataDocument_source doc
313
314 institutes = map text2ngrams
315 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
316 $ _hyperdataDocument_institutes doc
317
318 authors = map text2ngrams
319 $ maybe ["Nothing"] (splitOn ", ")
320 $ _hyperdataDocument_authors doc
321
322 leText = catMaybes [ _hyperdataDocument_title doc
323 , _hyperdataDocument_abstract doc
324 ]
325
326 terms' <- map text2ngrams
327 <$> map (intercalate " " . _terms_label)
328 <$> concat
329 <$> liftIO (extractTerms lang' leText)
330
331 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
332 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
333 <> [(a', DM.singleton Authors 1) | a' <- authors ]
334 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
335
336
337 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
338 -> Map Ngrams (Map NgramsType Int)
339 filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
340 where
341 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
342 True -> (ng,y)
343 False -> (Ngrams (Text.take s' t) n , y)
344
345
346 documentIdWithNgrams :: HasNodeError err
347 => (a
348 -> Cmd err (Map Ngrams (Map NgramsType Int)))
349 -> [DocumentWithId a]
350 -> Cmd err [DocumentIdWithNgrams a]
351 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
352 where
353 toDocumentIdWithNgrams d = do
354 e <- f $ documentData d
355 pure $ DocumentIdWithNgrams d e
356
357
358
359 -- FLOW LIST
360 -- | TODO check optimization
361 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
362 -> Map Ngrams (Map NgramsType (Map NodeId Int))
363 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
364 where
365 f :: DocumentIdWithNgrams a
366 -> Map Ngrams (Map NgramsType (Map NodeId Int))
367 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
368 where
369 nId = documentId $ documentWithId d
370
371 ------------------------------------------------------------------------
372 listInsert :: FlowCmdM env err m
373 => ListId -> Map NgramsType [NgramsElement]
374 -> m ()
375 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
376 -> putListNgrams lId typeList ngElmts
377 ) $ toList ngs
378
379 flowList :: FlowCmdM env err m => UserId -> CorpusId
380 -> Map NgramsType [NgramsElement]
381 -> m ListId
382 flowList uId cId ngs = do
383 lId <- getOrMkList cId uId
384 printDebug "listId flowList" lId
385 listInsert lId ngs
386 pure lId
387