]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
fix the weightedlogjaccard
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -- TODO-ACCESS:
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
14
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
17
18
19 -}
20
21 {-# OPTIONS_GHC -fno-warn-orphans #-}
22
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE ConstrainedClassMethods #-}
26 {-# LANGUAGE ConstraintKinds #-}
27 {-# LANGUAGE DeriveGeneric #-}
28 {-# LANGUAGE FlexibleContexts #-}
29 {-# LANGUAGE InstanceSigs #-}
30 {-# LANGUAGE NoImplicitPrelude #-}
31 {-# LANGUAGE OverloadedStrings #-}
32
33 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
34 where
35 import Prelude (String)
36 import Data.Either
37 import Debug.Trace (trace)
38 import Control.Lens ((^.), view, _Just)
39 import Control.Monad (mapM_)
40 import Control.Monad.IO.Class (liftIO)
41 import Data.List (concat)
42 import Data.Map (Map, lookup, toList)
43 import Data.Maybe (Maybe(..), catMaybes)
44 import Data.Monoid
45 import Data.Text (Text, splitOn, intercalate)
46 import GHC.Show (Show)
47 import Gargantext.API.Ngrams (HasRepoVar)
48 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Core.Types (NodePoly(..), Terms(..))
51 import Gargantext.Core.Types.Individu (Username)
52 import Gargantext.Core.Flow
53 import Gargantext.Core.Types.Main
54 import Gargantext.Database.Config (userMaster, corpusMasterName)
55 import Gargantext.Database.Flow.Utils (insertDocNgrams)
56 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
57 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
58 import Gargantext.Database.Root (getRoot)
59 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
60 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
61 import Gargantext.Database.Schema.User (getUser, UserLight(..))
62 import Gargantext.Database.TextSearch (searchInDatabase)
63 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
64 import Gargantext.Database.Utils (Cmd, CmdM)
65 import Gargantext.Ext.IMT (toSchoolName)
66 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
67 import Gargantext.Prelude
68 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
69 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
70 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
71 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
72 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
73 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
74 import Gargantext.Prelude.Utils hiding (hash)
75 import System.FilePath (FilePath)
76 import qualified Data.List as List
77 import qualified Data.Map as Map
78 import qualified Data.Text as Text
79 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
80 import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
81
82 type FlowCmdM env err m =
83 ( CmdM env err m
84 , RepoCmdM env err m
85 , HasNodeError err
86 , HasRepoVar env
87 )
88
89 ------------------------------------------------------------------------
90
91 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
92 -- | APIs
93 -- TODO instances
94 getDataApi :: Lang
95 -> Maybe Limit
96 -> ApiQuery
97 -> IO [HyperdataDocument]
98 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
99 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
100
101
102 flowCorpusApi :: ( FlowCmdM env err m)
103 => Username -> Either CorpusName [CorpusId]
104 -> TermType Lang
105 -> Maybe Limit
106 -> ApiQuery
107 -> m CorpusId
108 flowCorpusApi u n tt l q = do
109 docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
110 flowCorpus u n tt docs
111
112 ------------------------------------------------------------------------
113
114 flowAnnuaire :: FlowCmdM env err m
115 => Username -> Either CorpusName [CorpusId] -> (TermType Lang) -> FilePath -> m AnnuaireId
116 flowAnnuaire u n l filePath = do
117 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
118 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
119
120
121 flowCorpusDebat :: FlowCmdM env err m
122 => Username -> Either CorpusName [CorpusId]
123 -> Limit -> FilePath
124 -> m CorpusId
125 flowCorpusDebat u n l fp = do
126 docs <- liftIO ( splitEvery 500
127 <$> take l
128 <$> readFile' fp
129 :: IO [[GD.GrandDebatReference ]]
130 )
131 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
132
133 flowCorpusFile :: FlowCmdM env err m
134 => Username -> Either CorpusName [CorpusId]
135 -> Limit -- Limit the number of docs (for dev purpose)
136 -> TermType Lang -> FileFormat -> FilePath
137 -> m CorpusId
138 flowCorpusFile u n l la ff fp = do
139 docs <- liftIO ( splitEvery 500
140 <$> take l
141 <$> parseFile ff fp
142 )
143 flowCorpus u n la (map (map toHyperdataDocument) docs)
144
145 -- TODO query with complex query
146 flowCorpusSearchInDatabase :: FlowCmdM env err m
147 => Username -> Lang -> Text -> m CorpusId
148 flowCorpusSearchInDatabase u la q = do
149 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
150 ids <- map fst <$> searchInDatabase cId (stemIt q)
151 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
152
153
154 flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
155 => Username -> Lang -> Text -> m CorpusId
156 flowCorpusSearchInDatabaseApi u la q = do
157 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster (Left "") (Nothing :: Maybe HyperdataCorpus)
158 ids <- map fst <$> searchInDatabase cId (stemIt q)
159 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
160
161 ------------------------------------------------------------------------
162 -- | TODO improve the needed type to create/update a corpus
163 data UserInfo = Username Text
164 | UserId NodeId
165 data CorpusInfo = CorpusName Lang Text
166 | CorpusId Lang NodeId
167
168
169 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
170 => Maybe c -> Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
171 flow c u cn la docs = do
172 ids <- mapM (insertMasterDocs c la ) docs
173 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
174
175 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 => Username -> Either CorpusName [CorpusId] -> TermType Lang -> [[a]] -> m CorpusId
177 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
178
179 ------------------------------------------------------------------------
180 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
181 => Lang -> Username -> Either CorpusName [CorpusId] -> Maybe c -> [NodeId] -> m CorpusId
182 flowCorpusUser l userName corpusName ctype ids = do
183 -- User Flow
184 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
185 -- TODO: check if present already, ignore
186 _ <- Doc.add userCorpusId ids
187 tId <- mkNode NodeTexts userCorpusId userId
188
189 printDebug "Node Text Id" tId
190
191 -- User List Flow
192 --{-
193 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
194 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
195 userListId <- flowList userId userCorpusId ngs
196 printDebug "userListId" userListId
197 -- User Graph Flow
198 _ <- mkDashboard userCorpusId userId
199 _ <- mkGraph userCorpusId userId
200 --_ <- mkPhylo userCorpusId userId
201 --}
202
203
204 -- Annuaire Flow
205 -- _ <- mkAnnuaire rootUserId userId
206 pure userCorpusId
207
208
209 insertMasterDocs :: ( FlowCmdM env err m
210 , FlowCorpus a
211 , MkCorpus c
212 )
213 => Maybe c -> TermType Lang -> [a] -> m [DocId]
214 insertMasterDocs c lang hs = do
215 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
216
217 -- TODO Type NodeDocumentUnicised
218 let hs' = map addUniqId hs
219 ids <- insertDb masterUserId masterCorpusId hs'
220 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
221
222 let
223 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
224 where
225 m' = case m of
226 Nothing -> trace ("buildTries here" :: String)
227 $ Just
228 $ buildTries n ( fmap toToken $ uniText
229 $ Text.intercalate " . "
230 $ List.concat
231 $ map hasText documentsWithId
232 )
233 just_m -> just_m
234 fixLang l = l
235
236 lang' = fixLang lang
237 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
238 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
239 terms2id <- insertNgrams $ Map.keys maps
240 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
241
242 lId <- getOrMkList masterCorpusId masterUserId
243 _ <- insertDocNgrams lId indexedNgrams
244 pure $ map reId ids
245
246
247 type CorpusName = Text
248
249
250 getOrMkRoot :: (HasNodeError err) => Username -> Cmd err (UserId, RootId)
251 getOrMkRoot username = do
252 maybeUserId <- getUser username
253 userId <- case maybeUserId of
254 Nothing -> nodeError NoUserFound
255 Just user -> pure $ userLight_id user
256
257 rootId' <- map _node_id <$> getRoot username
258
259 rootId'' <- case rootId' of
260 [] -> mkRoot username userId
261 n -> case length n >= 2 of
262 True -> nodeError ManyNodeUsers
263 False -> pure rootId'
264
265 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
266 pure (userId, rootId)
267
268
269 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
270 => Username -> Either CorpusName [CorpusId] -> Maybe a
271 -> Cmd err (UserId, RootId, CorpusId)
272 getOrMkRootWithCorpus username cName c = do
273 (userId, rootId) <- getOrMkRoot username
274 corpusId'' <- if username == userMaster
275 then do
276 ns <- getCorporaWithParentId rootId
277 pure $ map _node_id ns
278 else
279 pure $ fromRight [] cName
280
281 corpusId' <- if corpusId'' /= []
282 then pure corpusId''
283 else mk (Just $ fromLeft "Default" cName) c rootId userId
284
285 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
286
287 pure (userId, rootId, corpusId)
288
289
290 ------------------------------------------------------------------------
291 viewUniqId' :: UniqId a => a -> (HashId, a)
292 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
293 where
294 err = panic "[ERROR] Database.Flow.toInsert"
295
296
297 toInserted :: [ReturnId] -> Map HashId ReturnId
298 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
299 . filter (\r -> reInserted r == True)
300
301 data DocumentWithId a = DocumentWithId
302 { documentId :: !NodeId
303 , documentData :: !a
304 } deriving (Show)
305
306 instance HasText a => HasText (DocumentWithId a)
307 where
308 hasText (DocumentWithId _ a) = hasText a
309
310 mergeData :: Map HashId ReturnId
311 -> Map HashId a
312 -> [DocumentWithId a]
313 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
314 where
315 toDocumentWithId (hash,hpd) =
316 DocumentWithId <$> fmap reId (lookup hash rs)
317 <*> Just hpd
318
319 ------------------------------------------------------------------------
320 data DocumentIdWithNgrams a = DocumentIdWithNgrams
321 { documentWithId :: !(DocumentWithId a)
322 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
323 } deriving (Show)
324
325
326 instance HasText HyperdataContact
327 where
328 hasText = undefined
329
330 instance ExtractNgramsT HyperdataContact
331 where
332 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
333 where
334 extract :: TermType Lang -> HyperdataContact
335 -> Cmd err (Map Ngrams (Map NgramsType Int))
336 extract _l hc' = do
337 let authors = map text2ngrams
338 $ maybe ["Nothing"] (\a -> [a])
339 $ view (hc_who . _Just . cw_lastName) hc'
340
341 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
342
343 instance HasText HyperdataDocument
344 where
345 hasText h = catMaybes [ _hyperdataDocument_title h
346 , _hyperdataDocument_abstract h
347 ]
348
349 instance ExtractNgramsT HyperdataDocument
350 where
351 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
352 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
353 where
354 extractNgramsT' :: TermType Lang -> HyperdataDocument
355 -> Cmd err (Map Ngrams (Map NgramsType Int))
356 extractNgramsT' lang' doc = do
357 let source = text2ngrams
358 $ maybe "Nothing" identity
359 $ _hyperdataDocument_source doc
360
361 institutes = map text2ngrams
362 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
363 $ _hyperdataDocument_institutes doc
364
365 authors = map text2ngrams
366 $ maybe ["Nothing"] (splitOn ", ")
367 $ _hyperdataDocument_authors doc
368
369 terms' <- map text2ngrams
370 <$> map (intercalate " " . _terms_label)
371 <$> concat
372 <$> liftIO (extractTerms lang' $ hasText doc)
373
374 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
375 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
376 <> [(a', Map.singleton Authors 1) | a' <- authors ]
377 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
378
379
380 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
381 -> Map Ngrams (Map NgramsType Int)
382 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
383 where
384 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
385 True -> (ng,y)
386 False -> (Ngrams (Text.take s' t) n , y)
387
388
389 documentIdWithNgrams :: HasNodeError err
390 => (a
391 -> Cmd err (Map Ngrams (Map NgramsType Int)))
392 -> [DocumentWithId a]
393 -> Cmd err [DocumentIdWithNgrams a]
394 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
395 where
396 toDocumentIdWithNgrams d = do
397 e <- f $ documentData d
398 pure $ DocumentIdWithNgrams d e
399
400
401 -- FLOW LIST
402 -- | TODO check optimization
403 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
404 -> Map Ngrams (Map NgramsType (Map NodeId Int))
405 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
406 where
407 f :: DocumentIdWithNgrams a
408 -> Map Ngrams (Map NgramsType (Map NodeId Int))
409 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
410 where
411 nId = documentId $ documentWithId d
412
413 ------------------------------------------------------------------------
414 listInsert :: FlowCmdM env err m
415 => ListId -> Map NgramsType [NgramsElement]
416 -> m ()
417 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
418 -> putListNgrams lId typeList ngElmts
419 ) $ toList ngs
420
421 flowList :: FlowCmdM env err m => UserId -> CorpusId
422 -> Map NgramsType [NgramsElement]
423 -> m ListId
424 flowList uId cId ngs = do
425 lId <- getOrMkList cId uId
426 printDebug "listId flowList" lId
427 listInsert lId ngs
428 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
429 pure lId
430