]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FIX] Flow force TextNodeId.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
22
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
32
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 where
35 import Prelude (String)
36 import Data.Either
37 import Debug.Trace (trace)
38 import Control.Lens ((^.), view, _Just)
39 import Control.Monad (mapM_)
40 import Control.Monad.IO.Class (liftIO)
41 import Data.List (concat)
42 import Data.Map (Map, lookup, toList)
43 import Data.Maybe (Maybe(..), catMaybes)
44 import Data.Monoid
45 import Data.Text (Text, splitOn, intercalate)
46 import GHC.Show (Show)
47 import Gargantext.API.Ngrams (HasRepoVar)
48 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Core.Types (NodePoly(..), Terms(..))
51 import Gargantext.Core.Types.Individu (Username)
52 import Gargantext.Core.Flow
53 import Gargantext.Core.Types.Main
54 import Gargantext.Database.Config (userMaster, corpusMasterName)
55 import Gargantext.Database.Flow.Utils (insertDocNgrams)
56 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
57 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
58 import Gargantext.Database.Root (getRoot)
59 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
60 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkPhylo, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
61 import Gargantext.Database.Schema.User (getUser, UserLight(..))
62 import Gargantext.Database.TextSearch (searchInDatabase)
63 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
64 import Gargantext.Database.Utils (Cmd, CmdM)
65 import Gargantext.Ext.IMT (toSchoolName)
66 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
67 import Gargantext.Prelude
68 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
69 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
70 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
71 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
72 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
73 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
74 import Gargantext.Prelude.Utils hiding (hash)
75 import System.FilePath (FilePath)
76 import qualified Data.List as List
77 import qualified Data.Map as Map
78 import qualified Data.Text as Text
79 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
80 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
81
82 type FlowCmdM env err m =
83 ( CmdM env err m
84 , RepoCmdM env err m
85 , HasNodeError err
86 , HasRepoVar env
87 )
88
89 ------------------------------------------------------------------------
90
91 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
92 -- | APIs
93 -- TODO instances
94 getDataApi :: Lang
95 -> Maybe Limit
96 -> ApiQuery
97 -> IO [HyperdataDocument]
98 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
99 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
100
101
102 flowCorpusApi :: ( FlowCmdM env err m)
103 => Username -> Either CorpusName [CorpusId]
104 -> TermType Lang
105 -> Maybe Limit
106 -> ApiQuery
107 -> m CorpusId
108 flowCorpusApi u n tt l q = do
109 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
110 flowCorpus u n tt docs
111
112 ------------------------------------------------------------------------
113
114 flowAnnuaire :: FlowCmdM env err m
115 => Username -> Either CorpusName [CorpusId] -> (TermType Lang) -> FilePath -> m AnnuaireId
116 flowAnnuaire u n l filePath = do
117 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
118 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
119
120
121 flowCorpusDebat :: FlowCmdM env err m
122 => Username -> Either CorpusName [CorpusId]
123 -> Limit -> FilePath
124 -> m CorpusId
125 flowCorpusDebat u n l fp = do
126 docs <- liftIO ( splitEvery 500
127 <$> take l
128 <$> readFile' fp
129 :: IO [[GD.GrandDebatReference ]]
130 )
131 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
132
133 flowCorpusFile :: FlowCmdM env err m
134 => Username -> Either CorpusName [CorpusId]
135 -> Limit -- Limit the number of docs (for dev purpose)
136 -> TermType Lang -> FileFormat -> FilePath
137 -> m CorpusId
138 flowCorpusFile u n l la ff fp = do
139 docs <- liftIO ( splitEvery 500
140 <$> take l
141 <$> parseFile ff fp
142 )
143 flowCorpus u n la (map (map toHyperdataDocument) docs)
144
145 -- TODO query with complex query
146 flowCorpusSearchInDatabase :: FlowCmdM env err m
147 => Username -> Lang -> Text -> m CorpusId
148 flowCorpusSearchInDatabase u la q = do
149 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
150 ids <- map fst <$> searchInDatabase cId (stemIt q)
151 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
152
153
154 flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
155 => Username -> Lang -> Text -> m CorpusId
156 flowCorpusSearchInDatabaseApi u la q = do
157 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
158 ids <- map fst <$> searchInDatabase cId (stemIt q)
159 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
160
161 ------------------------------------------------------------------------
162 -- | TODO improve the needed type to create/update a corpus
163 data UserInfo = Username Text
164 | UserId NodeId
165 data CorpusInfo = CorpusName Lang Text
166 | CorpusId Lang NodeId
167
168
169 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
170 => Maybe c -> Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
171 flow c u cn la docs = do
172 ids <- mapM (insertMasterDocs c la ) docs
173 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
174
175 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 => Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
177 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
178
179 ------------------------------------------------------------------------
180
181
182 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
183 => Lang -> Username -> Either CorpusName [CorpusId] -> Maybe c -> [NodeId] -> m CorpusId
184 flowCorpusUser l userName corpusName ctype ids = do
185 -- User Flow
186 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
187 -- TODO: check if present already, ignore
188 _ <- Doc.add userCorpusId ids
189 tId <- mkTexts userCorpusId userId
190 printDebug "Node Text Id" tId
191
192 -- User List Flow
193 --{-
194 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
195 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
196 userListId <- flowList userId userCorpusId ngs
197 printDebug "userListId" userListId
198 -- User Graph Flow
199 _ <- mkDashboard userCorpusId userId
200 _ <- mkGraph userCorpusId userId
201 _ <- mkPhylo userCorpusId userId
202 --}
203
204
205 -- Annuaire Flow
206 -- _ <- mkAnnuaire rootUserId userId
207 pure userCorpusId
208
209
210 insertMasterDocs :: ( FlowCmdM env err m
211 , FlowCorpus a
212 , MkCorpus c
213 )
214 => Maybe c -> TermType Lang -> [a] -> m [DocId]
215 insertMasterDocs c lang hs = do
216 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
217
218 -- TODO Type NodeDocumentUnicised
219 let hs' = map addUniqId hs
220 ids <- insertDb masterUserId masterCorpusId hs'
221 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
222
223 let
224 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
225 where
226 m' = case m of
227 Nothing -> trace ("buildTries here" :: String)
228 $ Just
229 $ buildTries n ( fmap toToken $ uniText
230 $ Text.intercalate " . "
231 $ List.concat
232 $ map hasText documentsWithId
233 )
234 just_m -> just_m
235 fixLang l = l
236
237 lang' = fixLang lang
238 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
239 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
240 terms2id <- insertNgrams $ Map.keys maps
241 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
242
243 lId <- getOrMkList masterCorpusId masterUserId
244 _ <- insertDocNgrams lId indexedNgrams
245 pure $ map reId ids
246
247
248
249 type CorpusName = Text
250
251 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
252 => Username -> Either CorpusName [CorpusId] -> Maybe a
253 -> Cmd err (UserId, RootId, CorpusId)
254 getOrMkRootWithCorpus username cName c = do
255 maybeUserId <- getUser username
256 userId <- case maybeUserId of
257 Nothing -> nodeError NoUserFound
258 Just user -> pure $ userLight_id user
259
260 rootId' <- map _node_id <$> getRoot username
261
262 rootId'' <- case rootId' of
263 [] -> mkRoot username userId
264 n -> case length n >= 2 of
265 True -> nodeError ManyNodeUsers
266 False -> pure rootId'
267
268 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
269
270 corpusId'' <- if username == userMaster
271 then do
272 ns <- getCorporaWithParentId rootId
273 pure $ map _node_id ns
274 else
275 pure $ fromRight [] cName
276
277 corpusId' <- if corpusId'' /= []
278 then pure corpusId''
279 else mk (Just $ fromLeft "Default" cName) c rootId userId
280
281 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
282
283 pure (userId, rootId, corpusId)
284
285
286 ------------------------------------------------------------------------
287
288
289 viewUniqId' :: UniqId a => a -> (HashId, a)
290 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
291 where
292 err = panic "[ERROR] Database.Flow.toInsert"
293
294
295 toInserted :: [ReturnId] -> Map HashId ReturnId
296 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
297 . filter (\r -> reInserted r == True)
298
299 data DocumentWithId a = DocumentWithId
300 { documentId :: !NodeId
301 , documentData :: !a
302 } deriving (Show)
303
304 instance HasText a => HasText (DocumentWithId a)
305 where
306 hasText (DocumentWithId _ a) = hasText a
307
308 mergeData :: Map HashId ReturnId
309 -> Map HashId a
310 -> [DocumentWithId a]
311 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
312 where
313 toDocumentWithId (hash,hpd) =
314 DocumentWithId <$> fmap reId (lookup hash rs)
315 <*> Just hpd
316
317 ------------------------------------------------------------------------
318 data DocumentIdWithNgrams a = DocumentIdWithNgrams
319 { documentWithId :: !(DocumentWithId a)
320 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
321 } deriving (Show)
322
323
324 instance HasText HyperdataContact
325 where
326 hasText = undefined
327
328 instance ExtractNgramsT HyperdataContact
329 where
330 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
331 where
332 extract :: TermType Lang -> HyperdataContact
333 -> Cmd err (Map Ngrams (Map NgramsType Int))
334 extract _l hc' = do
335 let authors = map text2ngrams
336 $ maybe ["Nothing"] (\a -> [a])
337 $ view (hc_who . _Just . cw_lastName) hc'
338
339 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
340
341 instance HasText HyperdataDocument
342 where
343 hasText h = catMaybes [ _hyperdataDocument_title h
344 , _hyperdataDocument_abstract h
345 ]
346
347 instance ExtractNgramsT HyperdataDocument
348 where
349 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
350 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
351 where
352 extractNgramsT' :: TermType Lang -> HyperdataDocument
353 -> Cmd err (Map Ngrams (Map NgramsType Int))
354 extractNgramsT' lang' doc = do
355 let source = text2ngrams
356 $ maybe "Nothing" identity
357 $ _hyperdataDocument_source doc
358
359 institutes = map text2ngrams
360 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
361 $ _hyperdataDocument_institutes doc
362
363 authors = map text2ngrams
364 $ maybe ["Nothing"] (splitOn ", ")
365 $ _hyperdataDocument_authors doc
366
367 terms' <- map text2ngrams
368 <$> map (intercalate " " . _terms_label)
369 <$> concat
370 <$> liftIO (extractTerms lang' $ hasText doc)
371
372 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
373 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
374 <> [(a', Map.singleton Authors 1) | a' <- authors ]
375 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
376
377
378 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
379 -> Map Ngrams (Map NgramsType Int)
380 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
381 where
382 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
383 True -> (ng,y)
384 False -> (Ngrams (Text.take s' t) n , y)
385
386
387 documentIdWithNgrams :: HasNodeError err
388 => (a
389 -> Cmd err (Map Ngrams (Map NgramsType Int)))
390 -> [DocumentWithId a]
391 -> Cmd err [DocumentIdWithNgrams a]
392 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
393 where
394 toDocumentIdWithNgrams d = do
395 e <- f $ documentData d
396 pure $ DocumentIdWithNgrams d e
397
398
399 -- FLOW LIST
400 -- | TODO check optimization
401 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
402 -> Map Ngrams (Map NgramsType (Map NodeId Int))
403 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
404 where
405 f :: DocumentIdWithNgrams a
406 -> Map Ngrams (Map NgramsType (Map NodeId Int))
407 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
408 where
409 nId = documentId $ documentWithId d
410
411 ------------------------------------------------------------------------
412 listInsert :: FlowCmdM env err m
413 => ListId -> Map NgramsType [NgramsElement]
414 -> m ()
415 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
416 -> putListNgrams lId typeList ngElmts
417 ) $ toList ngs
418
419 flowList :: FlowCmdM env err m => UserId -> CorpusId
420 -> Map NgramsType [NgramsElement]
421 -> m ListId
422 flowList uId cId ngs = do
423 lId <- getOrMkList cId uId
424 printDebug "listId flowList" lId
425 listInsert lId ngs
426 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
427 pure lId
428