]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[MERGE] Ngrams repos fixes
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
31 ( FlowCmdM
32 , flowCorpusFile
33 , flowCorpus
34 , flowCorpusSearchInDatabase
35 , getOrMkRoot
36 , getOrMkRootWithCorpus
37 , flowAnnuaire
38 )
39 where
40 import Prelude (String)
41 import Data.Either
42 import Debug.Trace (trace)
43 import Control.Lens ((^.), view, _Just)
44 import Control.Monad.IO.Class (liftIO)
45 import Data.List (concat)
46 import Data.Map (Map, lookup)
47 import Data.Maybe (Maybe(..), catMaybes)
48 import Data.Monoid
49 import Data.Text (Text, splitOn, intercalate)
50 import Gargantext.Core (Lang(..))
51 import Gargantext.Core.Types (NodePoly(..), Terms(..))
52 import Gargantext.Core.Types.Individu (Username)
53 import Gargantext.Core.Flow
54 import Gargantext.Core.Types.Main
55 import Gargantext.Database.Config (userMaster, corpusMasterName)
56 import Gargantext.Database.Flow.Utils (insertDocNgrams)
57 import Gargantext.Database.Flow.List
58 import Gargantext.Database.Flow.Types
59 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
60 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
61 import Gargantext.Database.Root (getRoot)
62 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
63 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
64 import Gargantext.Database.Schema.User (getUser, UserLight(..))
65 import Gargantext.Database.TextSearch (searchInDatabase)
66 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
67 import Gargantext.Database.Utils (Cmd)
68 import Gargantext.Ext.IMT (toSchoolName)
69 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
70 import Gargantext.Prelude
71 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
72 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
73 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
74 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
75 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
76 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
77 import Gargantext.Prelude.Utils hiding (sha)
78 import System.FilePath (FilePath)
79 import qualified Data.List as List
80 import qualified Data.Map as Map
81 import qualified Data.Text as Text
82 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
83 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
84
85 ------------------------------------------------------------------------
86
87 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
88 -- | APIs
89 -- TODO instances
90 getDataApi :: Lang
91 -> Maybe Limit
92 -> ApiQuery
93 -> IO [HyperdataDocument]
94 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
95 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
96
97
98 -- UNUSED
99 _flowCorpusApi :: ( FlowCmdM env err m)
100 => Username -> Either CorpusName [CorpusId]
101 -> TermType Lang
102 -> Maybe Limit
103 -> ApiQuery
104 -> m CorpusId
105 _flowCorpusApi u n tt l q = do
106 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
107 flowCorpus u n tt docs
108
109 ------------------------------------------------------------------------
110
111 flowAnnuaire :: FlowCmdM env err m
112 => Username
113 -> Either CorpusName [CorpusId]
114 -> (TermType Lang)
115 -> FilePath
116 -> m AnnuaireId
117 flowAnnuaire u n l filePath = do
118 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
119 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
120
121 -- UNUSED
122 _flowCorpusDebat :: FlowCmdM env err m
123 => Username -> Either CorpusName [CorpusId]
124 -> Limit -> FilePath
125 -> m CorpusId
126 _flowCorpusDebat u n l fp = do
127 docs <- liftIO ( splitEvery 500
128 <$> take l
129 <$> readFile' fp
130 :: IO [[GD.GrandDebatReference ]]
131 )
132 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
133
134 flowCorpusFile :: FlowCmdM env err m
135 => Username -> Either CorpusName [CorpusId]
136 -> Limit -- Limit the number of docs (for dev purpose)
137 -> TermType Lang -> FileFormat -> FilePath
138 -> m CorpusId
139 flowCorpusFile u n l la ff fp = do
140 docs <- liftIO ( splitEvery 500
141 <$> take l
142 <$> parseFile ff fp
143 )
144 flowCorpus u n la (map (map toHyperdataDocument) docs)
145
146 -- TODO query with complex query
147 flowCorpusSearchInDatabase :: FlowCmdM env err m
148 => Username
149 -> Lang
150 -> Text
151 -> m CorpusId
152 flowCorpusSearchInDatabase u la q = do
153 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
154 userMaster
155 (Left "")
156 (Nothing :: Maybe HyperdataCorpus)
157 ids <- map fst <$> searchInDatabase cId (stemIt q)
158 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
159
160
161 -- UNUSED
162 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
163 => Username
164 -> Lang
165 -> Text
166 -> m CorpusId
167 _flowCorpusSearchInDatabaseApi u la q = do
168 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
169 userMaster
170 (Left "")
171 (Nothing :: Maybe HyperdataCorpus)
172 ids <- map fst <$> searchInDatabase cId (stemIt q)
173 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
174
175 ------------------------------------------------------------------------
176 -- | TODO improve the needed type to create/update a corpus
177 {- UNUSED
178 data UserInfo = Username Text
179 | UserId NodeId
180 data CorpusInfo = CorpusName Lang Text
181 | CorpusId Lang NodeId
182 -}
183
184 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
185 => Maybe c
186 -> Username
187 -> Either CorpusName [CorpusId]
188 -> TermType Lang
189 -> [[a]]
190 -> m CorpusId
191 flow c u cn la docs = do
192 ids <- mapM (insertMasterDocs c la ) docs
193 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
194
195 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
196 => Username
197 -> Either CorpusName [CorpusId]
198 -> TermType Lang
199 -> [[a]]
200 -> m CorpusId
201 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
202
203 ------------------------------------------------------------------------
204 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
205 => Lang
206 -> Username
207 -> Either CorpusName [CorpusId]
208 -> Maybe c
209 -> [NodeId]
210 -> m CorpusId
211 flowCorpusUser l userName corpusName ctype ids = do
212 -- User Flow
213 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
214 listId <- getOrMkList userCorpusId userId
215 _cooc <- mkNode NodeListCooc listId userId
216 -- TODO: check if present already, ignore
217 _ <- Doc.add userCorpusId ids
218
219 _tId <- mkNode NodeTexts userCorpusId userId
220 -- printDebug "Node Text Id" tId
221
222 -- User List Flow
223 --{-
224 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
225 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
226 _userListId <- flowList listId ngs
227 --mastListId <- getOrMkList masterCorpusId masterUserId
228 -- _ <- insertOccsUpdates userCorpusId mastListId
229 -- printDebug "userListId" userListId
230 -- User Graph Flow
231 _ <- mkDashboard userCorpusId userId
232 _ <- mkGraph userCorpusId userId
233 --_ <- mkPhylo userCorpusId userId
234 --}
235
236
237 -- Annuaire Flow
238 -- _ <- mkAnnuaire rootUserId userId
239 pure userCorpusId
240
241
242 insertMasterDocs :: ( FlowCmdM env err m
243 , FlowCorpus a
244 , MkCorpus c
245 )
246 => Maybe c
247 -> TermType Lang
248 -> [a]
249 -> m [DocId]
250 insertMasterDocs c lang hs = do
251 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
252
253 -- TODO Type NodeDocumentUnicised
254 let hs' = map addUniqId hs
255 ids <- insertDb masterUserId masterCorpusId hs'
256 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
257
258 let
259 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
260 where
261 m' = case m of
262 Nothing -> trace ("buildTries here" :: String)
263 $ Just
264 $ buildTries n ( fmap toToken $ uniText
265 $ Text.intercalate " . "
266 $ List.concat
267 $ map hasText documentsWithId
268 )
269 just_m -> just_m
270 fixLang l = l
271
272 lang' = fixLang lang
273 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
274 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
275 terms2id <- insertNgrams $ Map.keys maps
276 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
277
278 lId <- getOrMkList masterCorpusId masterUserId
279 _cooc <- mkNode NodeListCooc lId masterUserId
280 _ <- insertDocNgrams lId indexedNgrams
281
282 pure $ map reId ids
283
284
285 type CorpusName = Text
286
287
288 getOrMkRoot :: (HasNodeError err)
289 => Username
290 -> Cmd err (UserId, RootId)
291 getOrMkRoot username = do
292 maybeUserId <- getUser username
293 userId <- case maybeUserId of
294 Nothing -> nodeError NoUserFound
295 Just user -> pure $ userLight_id user
296
297 rootId' <- map _node_id <$> getRoot username
298
299 rootId'' <- case rootId' of
300 [] -> mkRoot username userId
301 n -> case length n >= 2 of
302 True -> nodeError ManyNodeUsers
303 False -> pure rootId'
304
305 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
306 pure (userId, rootId)
307
308
309 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
310 => Username
311 -> Either CorpusName [CorpusId]
312 -> Maybe a
313 -> Cmd err (UserId, RootId, CorpusId)
314 getOrMkRootWithCorpus username cName c = do
315 (userId, rootId) <- getOrMkRoot username
316 corpusId'' <- if username == userMaster
317 then do
318 ns <- getCorporaWithParentId rootId
319 pure $ map _node_id ns
320 else
321 pure $ fromRight [] cName
322
323 corpusId' <- if corpusId'' /= []
324 then pure corpusId''
325 else mk (Just $ fromLeft "Default" cName) c rootId userId
326
327 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
328
329 pure (userId, rootId, corpusId)
330
331
332 ------------------------------------------------------------------------
333 viewUniqId' :: UniqId a
334 => a
335 -> (HashId, a)
336 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
337 where
338 err = panic "[ERROR] Database.Flow.toInsert"
339
340
341 toInserted :: [ReturnId]
342 -> Map HashId ReturnId
343 toInserted =
344 Map.fromList . map (\r -> (reUniqId r, r) )
345 . filter (\r -> reInserted r == True)
346
347 mergeData :: Map HashId ReturnId
348 -> Map HashId a
349 -> [DocumentWithId a]
350 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
351 where
352 toDocumentWithId (sha,hpd) =
353 DocumentWithId <$> fmap reId (lookup sha rs)
354 <*> Just hpd
355
356 ------------------------------------------------------------------------
357
358 instance HasText HyperdataContact
359 where
360 hasText = undefined
361
362 instance ExtractNgramsT HyperdataContact
363 where
364 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
365 where
366 extract :: TermType Lang -> HyperdataContact
367 -> Cmd err (Map Ngrams (Map NgramsType Int))
368 extract _l hc' = do
369 let authors = map text2ngrams
370 $ maybe ["Nothing"] (\a -> [a])
371 $ view (hc_who . _Just . cw_lastName) hc'
372
373 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
374
375 instance HasText HyperdataDocument
376 where
377 hasText h = catMaybes [ _hyperdataDocument_title h
378 , _hyperdataDocument_abstract h
379 ]
380
381 instance ExtractNgramsT HyperdataDocument
382 where
383 extractNgramsT :: TermType Lang
384 -> HyperdataDocument
385 -> Cmd err (Map Ngrams (Map NgramsType Int))
386 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
387 where
388 extractNgramsT' :: TermType Lang
389 -> HyperdataDocument
390 -> Cmd err (Map Ngrams (Map NgramsType Int))
391 extractNgramsT' lang' doc = do
392 let source = text2ngrams
393 $ maybe "Nothing" identity
394 $ _hyperdataDocument_source doc
395
396 institutes = map text2ngrams
397 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
398 $ _hyperdataDocument_institutes doc
399
400 authors = map text2ngrams
401 $ maybe ["Nothing"] (splitOn ", ")
402 $ _hyperdataDocument_authors doc
403
404 terms' <- map text2ngrams
405 <$> map (intercalate " " . _terms_label)
406 <$> concat
407 <$> liftIO (extractTerms lang' $ hasText doc)
408
409 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
410 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
411 <> [(a', Map.singleton Authors 1) | a' <- authors ]
412 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
413
414 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
415 -> Map Ngrams (Map NgramsType Int)
416 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
417 where
418 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
419 True -> (ng,y)
420 False -> (Ngrams (Text.take s' t) n , y)
421
422
423 documentIdWithNgrams :: HasNodeError err
424 => (a
425 -> Cmd err (Map Ngrams (Map NgramsType Int)))
426 -> [DocumentWithId a]
427 -> Cmd err [DocumentIdWithNgrams a]
428 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
429 where
430 toDocumentIdWithNgrams d = do
431 e <- f $ documentData d
432 pure $ DocumentIdWithNgrams d e
433