]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[SEARCH] and create corpus
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE NoImplicitPrelude #-}
24 {-# LANGUAGE OverloadedStrings #-}
25 {-# LANGUAGE RankNTypes #-}
26 {-# LANGUAGE FlexibleContexts #-}
27
28 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
29 where
30
31 --import Debug.Trace (trace)
32 import Control.Lens ((^.), view, Lens', _Just)
33 import Control.Monad (mapM_)
34 import Control.Monad.IO.Class (liftIO)
35 import Data.List (concat)
36 import Data.Map (Map, lookup, toList)
37 import Data.Maybe (Maybe(..), catMaybes)
38 import Data.Monoid
39 import Data.Text (Text, splitOn, intercalate)
40 import GHC.Show (Show)
41 import Gargantext.API.Ngrams (HasRepoVar)
42 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Types (NodePoly(..), Terms(..))
45 import Gargantext.Core.Types.Individu (Username)
46 import Gargantext.Core.Types.Main
47 import Gargantext.Database.Config (userMaster, corpusMasterName)
48 import Gargantext.Database.Flow.Utils (insertDocNgrams)
49 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
50 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
51 import Gargantext.Database.Root (getRoot)
52 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
53 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
54 import Gargantext.Database.Schema.User (getUser, UserLight(..))
55 import Gargantext.Database.TextSearch (searchInDatabase)
56 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
57 import Gargantext.Database.Utils (Cmd, CmdM)
58 import Gargantext.Ext.IMT (toSchoolName)
59 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
60 import Gargantext.Prelude
61 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
62 import Gargantext.Text.Parsers (parseFile, FileFormat)
63 import Gargantext.Text.Terms (TermType(..), tt_lang)
64 import Gargantext.Text.Terms (extractTerms)
65 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
66 import Servant (ServantErr)
67 import System.FilePath (FilePath)
68 --import qualified Data.List as List
69 import qualified Data.Map as Map
70 import qualified Data.Text as Text
71 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
72 import qualified Gargantext.Text.Parsers.GrandDebat as GD
73
74 type FlowCmdM env err m =
75 ( CmdM env err m
76 , RepoCmdM env err m
77 , HasNodeError err
78 , HasRepoVar env
79 )
80
81 type FlowCorpus a = ( AddUniqId a
82 , UniqId a
83 , InsertDb a
84 , ExtractNgramsT a
85 )
86
87 ------------------------------------------------------------------------
88
89 flowAnnuaire :: FlowCmdM env ServantErr m
90 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
91 flowAnnuaire u n l filePath = do
92 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
93 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
94
95
96 flowCorpusDebat :: FlowCmdM env ServantErr m
97 => Username -> CorpusName
98 -> Limit -> FilePath
99 -> m CorpusId
100 flowCorpusDebat u n l fp = do
101 docs <- liftIO ( splitEvery 500
102 <$> take l
103 <$> GD.readFile fp
104 :: IO [[GD.GrandDebatReference ]]
105 )
106 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
107
108
109 flowCorpusFile :: FlowCmdM env ServantErr m
110 => Username -> CorpusName
111 -> Limit -- Limit the number of docs (for dev purpose)
112 -> TermType Lang -> FileFormat -> FilePath
113 -> m CorpusId
114 flowCorpusFile u n l la ff fp = do
115 docs <- liftIO ( splitEvery 500
116 <$> take l
117 <$> parseFile ff fp
118 )
119 flowCorpus u n la (map (map toHyperdataDocument) docs)
120
121 -- TODO query with complex query
122 flowCorpusSearchInDatabase :: FlowCmdM env err m
123 => Username -> Lang -> Text -> m CorpusId
124 flowCorpusSearchInDatabase u la q = do
125 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
126 ids <- map fst <$> searchInDatabase cId (stemIt q)
127 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
128
129
130 flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
131 => Username -> Lang -> Text -> m CorpusId
132 flowCorpusSearchInDatabase' u la q = do
133 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
134 ids <- map fst <$> searchInDatabase cId (stemIt q)
135 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
136
137
138 ------------------------------------------------------------------------
139
140 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
141 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
142 flow c u cn la docs = do
143 ids <- mapM (insertMasterDocs c la ) docs
144 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
145
146 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
147 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
148 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
149
150
151 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
152 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
153 flowCorpusUser l userName corpusName ctype ids = do
154 -- User Flow
155 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
156 -- TODO: check if present already, ignore
157 _ <- Doc.add userCorpusId ids
158
159 -- User List Flow
160 --{-
161 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
162 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
163 userListId <- flowList userId userCorpusId ngs
164 printDebug "userListId" userListId
165 -- User Graph Flow
166 _ <- mkGraph userCorpusId userId
167 --}
168
169 -- User Dashboard Flow
170 _ <- mkDashboard userCorpusId userId
171
172 -- Annuaire Flow
173 -- _ <- mkAnnuaire rootUserId userId
174 pure userCorpusId
175
176
177 insertMasterDocs :: ( FlowCmdM env ServantErr m
178 , FlowCorpus a
179 , MkCorpus c
180 )
181 => Maybe c -> TermType Lang -> [a] -> m [DocId]
182 insertMasterDocs c lang hs = do
183 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
184
185 -- TODO Type NodeDocumentUnicised
186 let hs' = map addUniqId hs
187 ids <- insertDb masterUserId masterCorpusId hs'
188 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
189
190 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
191 terms2id <- insertNgrams $ Map.keys maps
192 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
193
194 lId <- getOrMkList masterCorpusId masterUserId
195 _ <- insertDocNgrams lId indexedNgrams
196 pure $ map reId ids
197
198
199
200 type CorpusName = Text
201
202 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
203 => Username -> CorpusName -> Maybe a
204 -> Cmd err (UserId, RootId, CorpusId)
205 getOrMkRootWithCorpus username cName c = do
206 maybeUserId <- getUser username
207 userId <- case maybeUserId of
208 Nothing -> nodeError NoUserFound
209 Just user -> pure $ userLight_id user
210
211 rootId' <- map _node_id <$> getRoot username
212
213 rootId'' <- case rootId' of
214 [] -> mkRoot username userId
215 n -> case length n >= 2 of
216 True -> nodeError ManyNodeUsers
217 False -> pure rootId'
218
219 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
220
221 corpusId'' <- if username == userMaster
222 then do
223 ns <- getCorporaWithParentId rootId
224 pure $ map _node_id ns
225 else
226 pure []
227
228 corpusId' <- if corpusId'' /= []
229 then pure corpusId''
230 else mk (Just cName) c rootId userId
231
232 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
233
234 pure (userId, rootId, corpusId)
235
236
237 ------------------------------------------------------------------------
238
239
240 class UniqId a
241 where
242 uniqId :: Lens' a (Maybe HashId)
243
244
245 instance UniqId HyperdataDocument
246 where
247 uniqId = hyperdataDocument_uniqId
248
249 instance UniqId HyperdataContact
250 where
251 uniqId = hc_uniqId
252
253 viewUniqId' :: UniqId a => a -> (HashId, a)
254 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
255 where
256 err = panic "[ERROR] Database.Flow.toInsert"
257
258
259 toInserted :: [ReturnId] -> Map HashId ReturnId
260 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
261 . filter (\r -> reInserted r == True)
262
263 data DocumentWithId a = DocumentWithId
264 { documentId :: !NodeId
265 , documentData :: !a
266 } deriving (Show)
267
268 mergeData :: Map HashId ReturnId
269 -> Map HashId a
270 -> [DocumentWithId a]
271 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
272 where
273 toDocumentWithId (hash,hpd) =
274 DocumentWithId <$> fmap reId (lookup hash rs)
275 <*> Just hpd
276
277 ------------------------------------------------------------------------
278 data DocumentIdWithNgrams a = DocumentIdWithNgrams
279 { documentWithId :: !(DocumentWithId a)
280 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
281 } deriving (Show)
282
283 -- TODO extractNgrams according to Type of Data
284
285 class ExtractNgramsT h
286 where
287 extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
288
289
290 instance ExtractNgramsT HyperdataContact
291 where
292 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
293 where
294 extract :: TermType Lang -> HyperdataContact
295 -> Cmd err (Map Ngrams (Map NgramsType Int))
296 extract _l hc' = do
297 let authors = map text2ngrams
298 $ maybe ["Nothing"] (\a -> [a])
299 $ view (hc_who . _Just . cw_lastName) hc'
300
301 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
302
303
304
305
306 instance ExtractNgramsT HyperdataDocument
307 where
308 extractNgramsT = extractNgramsT'
309
310 extractNgramsT' :: TermType Lang -> HyperdataDocument
311 -> Cmd err (Map Ngrams (Map NgramsType Int))
312 extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
313 where
314 extractNgramsT'' :: TermType Lang -> HyperdataDocument
315 -> Cmd err (Map Ngrams (Map NgramsType Int))
316 extractNgramsT'' lang' doc = do
317 let source = text2ngrams
318 $ maybe "Nothing" identity
319 $ _hyperdataDocument_source doc
320
321 institutes = map text2ngrams
322 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
323 $ _hyperdataDocument_institutes doc
324
325 authors = map text2ngrams
326 $ maybe ["Nothing"] (splitOn ", ")
327 $ _hyperdataDocument_authors doc
328
329 leText = catMaybes [ _hyperdataDocument_title doc
330 , _hyperdataDocument_abstract doc
331 ]
332
333 terms' <- map text2ngrams
334 <$> map (intercalate " " . _terms_label)
335 <$> concat
336 <$> liftIO (extractTerms lang' leText)
337
338 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
339 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
340 <> [(a', Map.singleton Authors 1) | a' <- authors ]
341 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
342
343
344 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
345 -> Map Ngrams (Map NgramsType Int)
346 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
347 where
348 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
349 True -> (ng,y)
350 False -> (Ngrams (Text.take s' t) n , y)
351
352
353 documentIdWithNgrams :: HasNodeError err
354 => (a
355 -> Cmd err (Map Ngrams (Map NgramsType Int)))
356 -> [DocumentWithId a]
357 -> Cmd err [DocumentIdWithNgrams a]
358 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
359 where
360 toDocumentIdWithNgrams d = do
361 e <- f $ documentData d
362 pure $ DocumentIdWithNgrams d e
363
364
365
366 -- FLOW LIST
367 -- | TODO check optimization
368 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
369 -> Map Ngrams (Map NgramsType (Map NodeId Int))
370 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
371 where
372 f :: DocumentIdWithNgrams a
373 -> Map Ngrams (Map NgramsType (Map NodeId Int))
374 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
375 where
376 nId = documentId $ documentWithId d
377
378 ------------------------------------------------------------------------
379 listInsert :: FlowCmdM env err m
380 => ListId -> Map NgramsType [NgramsElement]
381 -> m ()
382 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
383 -> putListNgrams lId typeList ngElmts
384 ) $ toList ngs
385
386 flowList :: FlowCmdM env err m => UserId -> CorpusId
387 -> Map NgramsType [NgramsElement]
388 -> m ListId
389 flowList uId cId ngs = do
390 lId <- getOrMkList cId uId
391 printDebug "listId flowList" lId
392 listInsert lId ngs
393 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
394 pure lId
395