]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
working on the adaptative matching
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE FlexibleContexts #-}
24 {-# LANGUAGE InstanceSigs #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27 {-# LANGUAGE RankNTypes #-}
28 {-# LANGUAGE ConstrainedClassMethods #-}
29
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 where
32 import Prelude (String)
33 import Debug.Trace (trace)
34 import Control.Lens ((^.), view, Lens', _Just)
35 import Control.Monad (mapM_)
36 import Control.Monad.IO.Class (liftIO)
37 import Data.List (concat)
38 import Data.Map (Map, lookup, toList)
39 import Data.Maybe (Maybe(..), catMaybes)
40 import Data.Monoid
41 import Data.Text (Text, splitOn, intercalate)
42 import GHC.Show (Show)
43 import Gargantext.API.Ngrams (HasRepoVar)
44 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
45 import Gargantext.Core (Lang(..))
46 import Gargantext.Core.Types (NodePoly(..), Terms(..))
47 import Gargantext.Core.Types.Individu (Username)
48 import Gargantext.Core.Types.Main
49 import Gargantext.Database.Config (userMaster, corpusMasterName)
50 import Gargantext.Database.Flow.Utils (insertDocNgrams)
51 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
52 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
53 import Gargantext.Database.Root (getRoot)
54 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
55 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
56 import Gargantext.Database.Schema.User (getUser, UserLight(..))
57 import Gargantext.Database.TextSearch (searchInDatabase)
58 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
59 import Gargantext.Database.Utils (Cmd, CmdM)
60 import Gargantext.Ext.IMT (toSchoolName)
61 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
62 import Gargantext.Prelude
63 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
64 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
66 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
67 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
68 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
69 import Servant (ServantErr)
70 import System.FilePath (FilePath)
71 import qualified Data.List as List
72 import qualified Data.Map as Map
73 import qualified Data.Text as Text
74 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
75 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
76
77 type FlowCmdM env err m =
78 ( CmdM env err m
79 , RepoCmdM env err m
80 , HasNodeError err
81 , HasRepoVar env
82 )
83
84 type FlowCorpus a = ( AddUniqId a
85 , UniqId a
86 , InsertDb a
87 , ExtractNgramsT a
88 , HasText a
89 )
90
91 ------------------------------------------------------------------------
92
93 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
94 -- | APIs
95 -- TODO instances
96 getDataApi :: Lang
97 -> Maybe Limit
98 -> ApiQuery
99 -> IO [HyperdataDocument]
100 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
101 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
102
103
104 flowCorpusApi :: ( FlowCmdM env ServantErr m)
105 => Username -> CorpusName
106 -> TermType Lang
107 -> Maybe Limit
108 -> ApiQuery
109 -> m CorpusId
110 flowCorpusApi u n tt l q = do
111 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
112 flowCorpus u n tt docs
113
114 ------------------------------------------------------------------------
115
116
117 flowAnnuaire :: FlowCmdM env ServantErr m
118 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
119 flowAnnuaire u n l filePath = do
120 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
121 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
122
123
124 flowCorpusDebat :: FlowCmdM env ServantErr m
125 => Username -> CorpusName
126 -> Limit -> FilePath
127 -> m CorpusId
128 flowCorpusDebat u n l fp = do
129 docs <- liftIO ( splitEvery 500
130 <$> take l
131 <$> GD.readFile fp
132 :: IO [[GD.GrandDebatReference ]]
133 )
134 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
135
136 flowCorpusFile :: FlowCmdM env ServantErr m
137 => Username -> CorpusName
138 -> Limit -- Limit the number of docs (for dev purpose)
139 -> TermType Lang -> FileFormat -> FilePath
140 -> m CorpusId
141 flowCorpusFile u n l la ff fp = do
142 docs <- liftIO ( splitEvery 500
143 <$> take l
144 <$> parseFile ff fp
145 )
146 flowCorpus u n la (map (map toHyperdataDocument) docs)
147
148 -- TODO query with complex query
149 flowCorpusSearchInDatabase :: FlowCmdM env err m
150 => Username -> Lang -> Text -> m CorpusId
151 flowCorpusSearchInDatabase u la q = do
152 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
153 ids <- map fst <$> searchInDatabase cId (stemIt q)
154 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
155
156
157 flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
158 => Username -> Lang -> Text -> m CorpusId
159 flowCorpusSearchInDatabase' u la q = do
160 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
161 ids <- map fst <$> searchInDatabase cId (stemIt q)
162 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
163
164 ------------------------------------------------------------------------
165
166 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
167 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
168 flow c u cn la docs = do
169 ids <- mapM (insertMasterDocs c la ) docs
170 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
171
172 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
173 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
174 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
175
176
177 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
178 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
179 flowCorpusUser l userName corpusName ctype ids = do
180 -- User Flow
181 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
182 -- TODO: check if present already, ignore
183 _ <- Doc.add userCorpusId ids
184
185 -- User List Flow
186 --{-
187 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
188 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
189 userListId <- flowList userId userCorpusId ngs
190 printDebug "userListId" userListId
191 -- User Graph Flow
192 _ <- mkGraph userCorpusId userId
193 _ <- mkPhylo userCorpusId userId
194 --}
195
196 -- User Dashboard Flow
197 _ <- mkDashboard userCorpusId userId
198
199 -- Annuaire Flow
200 -- _ <- mkAnnuaire rootUserId userId
201 pure userCorpusId
202
203
204 insertMasterDocs :: ( FlowCmdM env ServantErr m
205 , FlowCorpus a
206 , MkCorpus c
207 )
208 => Maybe c -> TermType Lang -> [a] -> m [DocId]
209 insertMasterDocs c lang hs = do
210 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
211
212 -- TODO Type NodeDocumentUnicised
213 let hs' = map addUniqId hs
214 ids <- insertDb masterUserId masterCorpusId hs'
215 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
216
217 let
218 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
219 where
220 m' = case m of
221 Nothing -> trace ("buildTries here" :: String)
222 $ Just
223 $ buildTries n ( fmap toToken $ uniText
224 $ Text.intercalate " . "
225 $ List.concat
226 $ map hasText documentsWithId
227 )
228 just_m -> just_m
229 fixLang l = l
230
231 lang' = fixLang lang
232 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
233 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
234 terms2id <- insertNgrams $ Map.keys maps
235 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
236
237 lId <- getOrMkList masterCorpusId masterUserId
238 _ <- insertDocNgrams lId indexedNgrams
239 pure $ map reId ids
240
241
242
243 type CorpusName = Text
244
245 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
246 => Username -> CorpusName -> Maybe a
247 -> Cmd err (UserId, RootId, CorpusId)
248 getOrMkRootWithCorpus username cName c = do
249 maybeUserId <- getUser username
250 userId <- case maybeUserId of
251 Nothing -> nodeError NoUserFound
252 Just user -> pure $ userLight_id user
253
254 rootId' <- map _node_id <$> getRoot username
255
256 rootId'' <- case rootId' of
257 [] -> mkRoot username userId
258 n -> case length n >= 2 of
259 True -> nodeError ManyNodeUsers
260 False -> pure rootId'
261
262 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
263
264 corpusId'' <- if username == userMaster
265 then do
266 ns <- getCorporaWithParentId rootId
267 pure $ map _node_id ns
268 else
269 pure []
270
271 corpusId' <- if corpusId'' /= []
272 then pure corpusId''
273 else mk (Just cName) c rootId userId
274
275 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
276
277 pure (userId, rootId, corpusId)
278
279
280 ------------------------------------------------------------------------
281
282
283 class UniqId a
284 where
285 uniqId :: Lens' a (Maybe HashId)
286
287
288 instance UniqId HyperdataDocument
289 where
290 uniqId = hyperdataDocument_uniqId
291
292 instance UniqId HyperdataContact
293 where
294 uniqId = hc_uniqId
295
296 viewUniqId' :: UniqId a => a -> (HashId, a)
297 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
298 where
299 err = panic "[ERROR] Database.Flow.toInsert"
300
301
302 toInserted :: [ReturnId] -> Map HashId ReturnId
303 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
304 . filter (\r -> reInserted r == True)
305
306 data DocumentWithId a = DocumentWithId
307 { documentId :: !NodeId
308 , documentData :: !a
309 } deriving (Show)
310
311 instance HasText a => HasText (DocumentWithId a)
312 where
313 hasText (DocumentWithId _ a) = hasText a
314
315 mergeData :: Map HashId ReturnId
316 -> Map HashId a
317 -> [DocumentWithId a]
318 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
319 where
320 toDocumentWithId (hash,hpd) =
321 DocumentWithId <$> fmap reId (lookup hash rs)
322 <*> Just hpd
323
324 ------------------------------------------------------------------------
325 data DocumentIdWithNgrams a = DocumentIdWithNgrams
326 { documentWithId :: !(DocumentWithId a)
327 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
328 } deriving (Show)
329
330
331 class ExtractNgramsT h
332 where
333 extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
334
335 class HasText h
336 where
337 hasText :: h -> [Text]
338
339 instance HasText HyperdataContact
340 where
341 hasText = undefined
342
343 instance ExtractNgramsT HyperdataContact
344 where
345 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
346 where
347 extract :: TermType Lang -> HyperdataContact
348 -> Cmd err (Map Ngrams (Map NgramsType Int))
349 extract _l hc' = do
350 let authors = map text2ngrams
351 $ maybe ["Nothing"] (\a -> [a])
352 $ view (hc_who . _Just . cw_lastName) hc'
353
354 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
355
356 instance HasText HyperdataDocument
357 where
358 hasText h = catMaybes [ _hyperdataDocument_title h
359 , _hyperdataDocument_abstract h
360 ]
361
362 instance ExtractNgramsT HyperdataDocument
363 where
364 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
365 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
366 where
367 extractNgramsT' :: TermType Lang -> HyperdataDocument
368 -> Cmd err (Map Ngrams (Map NgramsType Int))
369 extractNgramsT' lang' doc = do
370 let source = text2ngrams
371 $ maybe "Nothing" identity
372 $ _hyperdataDocument_source doc
373
374 institutes = map text2ngrams
375 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
376 $ _hyperdataDocument_institutes doc
377
378 authors = map text2ngrams
379 $ maybe ["Nothing"] (splitOn ", ")
380 $ _hyperdataDocument_authors doc
381
382 terms' <- map text2ngrams
383 <$> map (intercalate " " . _terms_label)
384 <$> concat
385 <$> liftIO (extractTerms lang' $ hasText doc)
386
387 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
388 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
389 <> [(a', Map.singleton Authors 1) | a' <- authors ]
390 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
391
392
393 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
394 -> Map Ngrams (Map NgramsType Int)
395 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
396 where
397 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
398 True -> (ng,y)
399 False -> (Ngrams (Text.take s' t) n , y)
400
401
402 documentIdWithNgrams :: HasNodeError err
403 => (a
404 -> Cmd err (Map Ngrams (Map NgramsType Int)))
405 -> [DocumentWithId a]
406 -> Cmd err [DocumentIdWithNgrams a]
407 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
408 where
409 toDocumentIdWithNgrams d = do
410 e <- f $ documentData d
411 pure $ DocumentIdWithNgrams d e
412
413
414 -- FLOW LIST
415 -- | TODO check optimization
416 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
417 -> Map Ngrams (Map NgramsType (Map NodeId Int))
418 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
419 where
420 f :: DocumentIdWithNgrams a
421 -> Map Ngrams (Map NgramsType (Map NodeId Int))
422 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
423 where
424 nId = documentId $ documentWithId d
425
426 ------------------------------------------------------------------------
427 listInsert :: FlowCmdM env err m
428 => ListId -> Map NgramsType [NgramsElement]
429 -> m ()
430 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
431 -> putListNgrams lId typeList ngElmts
432 ) $ toList ngs
433
434 flowList :: FlowCmdM env err m => UserId -> CorpusId
435 -> Map NgramsType [NgramsElement]
436 -> m ListId
437 flowList uId cId ngs = do
438 lId <- getOrMkList cId uId
439 printDebug "listId flowList" lId
440 listInsert lId ngs
441 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
442 pure lId
443