]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[FIX] lists size
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36 , saveDocNgramsWith
37
38 , getOrMkRoot
39 , getOrMk_RootWithCorpus
40 , TermType(..)
41 , DataOrigin(..)
42 , allDataOrigins
43
44 , do_api
45 , indexAllDocumentsWithPosTag
46 )
47 where
48
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
51 import Data.Either
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
57 import Data.Monoid
58 import Data.Swagger
59 import Data.Text (splitOn)
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
67
68 import Gargantext.Core (Lang(..), PosTagAlgo(..))
69 import Gargantext.Core.Ext.IMT (toSchoolName)
70 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
71 import Gargantext.Core.Flow.Types
72 import Gargantext.Core.Text
73 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Core.Text.List (buildNgramsLists)
76 import Gargantext.Core.Text.Terms
77 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
78 import Gargantext.Core.Types (POS(NP))
79 import Gargantext.Core.Types.Individu (User(..))
80 import Gargantext.Core.Types.Main
81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
82 import Gargantext.Database.Action.Flow.List
83 import Gargantext.Database.Action.Flow.Types
84 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
85 import Gargantext.Database.Action.Search (searchDocInDatabase)
86 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
87 import Gargantext.Database.Admin.Types.Hyperdata
88 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
89 import Gargantext.Database.Prelude
90 import Gargantext.Database.Query.Table.Ngrams
91 import Gargantext.Database.Query.Table.Node
92 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
93 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
94 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
95 import Gargantext.Database.Query.Table.NodeNodeNgrams2
96 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
97 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
98 import Gargantext.Database.Types
99 import Gargantext.Prelude
100 import Gargantext.Prelude.Crypto.Hash (Hash)
101 import qualified Gargantext.Core.Text.Corpus.API as API
102 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
103
104 ------------------------------------------------------------------------
105 -- Impots for upgrade function
106 import Gargantext.Database.Query.Tree.Root (getRootId)
107 import Gargantext.Database.Query.Tree (findNodesId)
108 import qualified Data.List as List
109 ------------------------------------------------------------------------
110 -- TODO use internal with API name (could be old data)
111 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
112 | ExternalOrigin { _do_api :: API.ExternalAPIs }
113 -- TODO Web
114 deriving (Generic, Eq)
115
116 makeLenses ''DataOrigin
117 deriveJSON (unPrefix "_do_") ''DataOrigin
118 instance ToSchema DataOrigin where
119 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
120
121 allDataOrigins :: [DataOrigin]
122 allDataOrigins = map InternalOrigin API.externalAPIs
123 <> map ExternalOrigin API.externalAPIs
124
125 ---------------
126 data DataText = DataOld ![NodeId]
127 | DataNew ![[HyperdataDocument]]
128
129 -- TODO use the split parameter in config file
130 getDataText :: FlowCmdM env err m
131 => DataOrigin
132 -> TermType Lang
133 -> API.Query
134 -> Maybe API.Limit
135 -> m DataText
136 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
137 <$> splitEvery 500
138 <$> API.get api (_tt_lang la) q li
139
140 getDataText (InternalOrigin _) _la q _li = do
141 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
142 (UserName userMaster)
143 (Left "")
144 (Nothing :: Maybe HyperdataCorpus)
145 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
146 pure $ DataOld ids
147
148 -------------------------------------------------------------------------------
149 flowDataText :: ( FlowCmdM env err m
150 )
151 => User
152 -> DataText
153 -> TermType Lang
154 -> CorpusId
155 -> m CorpusId
156 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
157 where
158 corpusType = (Nothing :: Maybe HyperdataCorpus)
159 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
160
161 ------------------------------------------------------------------------
162 -- TODO use proxy
163 flowAnnuaire :: (FlowCmdM env err m)
164 => User
165 -> Either CorpusName [CorpusId]
166 -> (TermType Lang)
167 -> FilePath
168 -> m AnnuaireId
169 flowAnnuaire u n l filePath = do
170 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
171 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
172
173 ------------------------------------------------------------------------
174 flowCorpusFile :: (FlowCmdM env err m)
175 => User
176 -> Either CorpusName [CorpusId]
177 -> Limit -- Limit the number of docs (for dev purpose)
178 -> TermType Lang -> FileFormat -> FilePath
179 -> m CorpusId
180 flowCorpusFile u n l la ff fp = do
181 docs <- liftBase ( splitEvery 500
182 <$> take l
183 <$> parseFile ff fp
184 )
185 flowCorpus u n la (map (map toHyperdataDocument) docs)
186
187 ------------------------------------------------------------------------
188 -- | TODO improve the needed type to create/update a corpus
189 -- (For now, Either is enough)
190 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
191 => User
192 -> Either CorpusName [CorpusId]
193 -> TermType Lang
194 -> [[a]]
195 -> m CorpusId
196 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
197
198
199 flow :: ( FlowCmdM env err m
200 , FlowCorpus a
201 , MkCorpus c
202 )
203 => Maybe c
204 -> User
205 -> Either CorpusName [CorpusId]
206 -> TermType Lang
207 -> [[a]]
208 -> m CorpusId
209 flow c u cn la docs = do
210 -- TODO if public insertMasterDocs else insertUserDocs
211 ids <- traverse (insertMasterDocs c la) docs
212 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
213
214 ------------------------------------------------------------------------
215 flowCorpusUser :: ( FlowCmdM env err m
216 , MkCorpus c
217 )
218 => Lang
219 -> User
220 -> Either CorpusName [CorpusId]
221 -> Maybe c
222 -> [NodeId]
223 -> m CorpusId
224 flowCorpusUser l user corpusName ctype ids = do
225 -- User Flow
226 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
227 -- NodeTexts is first
228 _tId <- insertDefaultNode NodeTexts userCorpusId userId
229 -- printDebug "NodeTexts: " tId
230
231 -- NodeList is second
232 listId <- getOrMkList userCorpusId userId
233 -- _cooc <- insertDefaultNode NodeListCooc listId userId
234 -- TODO: check if present already, ignore
235 _ <- Doc.add userCorpusId ids
236
237 -- printDebug "Node Text Ids:" tId
238
239 -- User List Flow
240 (masterUserId, _masterRootId, masterCorpusId)
241 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
242
243 --let gp = (GroupParams l 2 3 (StopSize 3))
244 let gp = GroupWithPosTag l CoreNLP HashMap.empty
245 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
246
247 _userListId <- flowList_DbRepo listId ngs
248 _mastListId <- getOrMkList masterCorpusId masterUserId
249 -- _ <- insertOccsUpdates userCorpusId mastListId
250 -- printDebug "userListId" userListId
251 -- User Graph Flow
252 _ <- insertDefaultNode NodeDashboard userCorpusId userId
253 _ <- insertDefaultNode NodeGraph userCorpusId userId
254 --_ <- mkPhylo userCorpusId userId
255 -- Annuaire Flow
256 -- _ <- mkAnnuaire rootUserId userId
257 pure userCorpusId
258
259
260 insertMasterDocs :: ( FlowCmdM env err m
261 , FlowCorpus a
262 , MkCorpus c
263 )
264 => Maybe c
265 -> TermType Lang
266 -> [a]
267 -> m [DocId]
268 insertMasterDocs c lang hs = do
269 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
270 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
271 _ <- Doc.add masterCorpusId ids'
272 -- TODO
273 -- create a corpus with database name (CSV or PubMed)
274 -- add documents to the corpus (create node_node link)
275 -- this will enable global database monitoring
276
277 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
278 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
279 <- mapNodeIdNgrams
280 <$> documentIdWithNgrams
281 (extractNgramsT $ withLang lang documentsWithId)
282 documentsWithId
283
284 lId <- getOrMkList masterCorpusId masterUserId
285 _ <- saveDocNgramsWith lId mapNgramsDocs'
286
287 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
288 pure ids'
289
290 saveDocNgramsWith :: ( FlowCmdM env err m)
291 => ListId
292 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
293 -> m ()
294 saveDocNgramsWith lId mapNgramsDocs' = do
295 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
296 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
297
298 -- to be removed
299 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
300
301 -- new
302 mapCgramsId <- listInsertDb lId toNodeNgramsW'
303 $ map (first _ngramsTerms . second Map.keys)
304 $ HashMap.toList mapNgramsDocs
305
306 -- insertDocNgrams
307 _return <- insertNodeNodeNgrams2
308 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
309 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
310 <*> Just (fromIntegral w :: Double)
311 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
312 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
313 , (nId, w) <- Map.toList mapNodeIdWeight
314 ]
315 -- to be removed
316 _ <- insertDocNgrams lId indexedNgrams
317
318 pure ()
319
320
321 ------------------------------------------------------------------------
322 -- TODO Type NodeDocumentUnicised
323 insertDocs :: ( FlowCmdM env err m
324 -- , FlowCorpus a
325 , FlowInsertDB a
326 )
327 => UserId
328 -> CorpusId
329 -> [a]
330 -> m ([DocId], [Indexed NodeId a])
331 insertDocs uId cId hs = do
332 let docs = map addUniqId hs
333 newIds <- insertDb uId cId docs
334 -- printDebug "newIds" newIds
335 let
336 newIds' = map reId newIds
337 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
338 _ <- Doc.add cId newIds'
339 pure (newIds', documentsWithId)
340
341
342 ------------------------------------------------------------------------
343 viewUniqId' :: UniqId a
344 => a
345 -> (Hash, a)
346 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
347 where
348 err = panic "[ERROR] Database.Flow.toInsert"
349
350
351 toInserted :: [ReturnId]
352 -> Map Hash ReturnId
353 toInserted =
354 Map.fromList . map (\r -> (reUniqId r, r) )
355 . filter (\r -> reInserted r == True)
356
357 mergeData :: Map Hash ReturnId
358 -> Map Hash a
359 -> [Indexed NodeId a]
360 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
361 where
362 toDocumentWithId (sha,hpd) =
363 Indexed <$> fmap reId (lookup sha rs)
364 <*> Just hpd
365
366 ------------------------------------------------------------------------
367 ------------------------------------------------------------------------
368 ------------------------------------------------------------------------
369 documentIdWithNgrams :: HasNodeError err
370 => (a
371 -> Cmd err (HashMap b (Map NgramsType Int)))
372 -> [Indexed NodeId a]
373 -> Cmd err [DocumentIdWithNgrams a b]
374 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
375 where
376 toDocumentIdWithNgrams d = do
377 e <- f $ _unIndex d
378 pure $ DocumentIdWithNgrams d e
379
380
381 -- | TODO check optimization
382 mapNodeIdNgrams :: (Ord b, Hashable b)
383 => [DocumentIdWithNgrams a b]
384 -> HashMap b
385 (Map NgramsType
386 (Map NodeId Int)
387 )
388 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
389 where
390 f :: DocumentIdWithNgrams a b
391 -> HashMap b (Map NgramsType (Map NodeId Int))
392 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
393 where
394 nId = _index $ documentWithId d
395
396
397 ------------------------------------------------------------------------
398 instance ExtractNgramsT HyperdataContact
399 where
400 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
401 where
402 extract :: TermType Lang -> HyperdataContact
403 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
404 extract _l hc' = do
405 let authors = map text2ngrams
406 $ maybe ["Nothing"] (\a -> [a])
407 $ view (hc_who . _Just . cw_lastName) hc'
408
409 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
410
411
412 instance ExtractNgramsT HyperdataDocument
413 where
414 extractNgramsT :: TermType Lang
415 -> HyperdataDocument
416 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
417 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
418 where
419 extractNgramsT' :: TermType Lang
420 -> HyperdataDocument
421 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
422 extractNgramsT' lang' doc = do
423 let source = text2ngrams
424 $ maybe "Nothing" identity
425 $ _hd_source doc
426
427 institutes = map text2ngrams
428 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
429 $ _hd_institutes doc
430
431 authors = map text2ngrams
432 $ maybe ["Nothing"] (splitOn ", ")
433 $ _hd_authors doc
434
435 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
436 <$> concat
437 <$> liftBase (extractTerms lang' $ hasText doc)
438
439 pure $ HashMap.fromList
440 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
441 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
442 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
443 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
444
445 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
446 where
447 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
448
449 instance HasText a => HasText (Node a)
450 where
451 hasText (Node _ _ _ _ _ _ _ h) = hasText h
452
453
454
455 -- | TODO putelsewhere
456 -- | Upgrade function
457 -- Suppose all documents are English (this is the case actually)
458 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
459 indexAllDocumentsWithPosTag = do
460 rootId <- getRootId (UserName userMaster)
461 corpusIds <- findNodesId rootId [NodeCorpus]
462 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
463
464 _ <- mapM extractInsert (splitEvery 1000 docs)
465
466 pure ()
467
468 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
469 extractInsert docs = do
470 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
471
472 mapNgramsDocs' <- mapNodeIdNgrams
473 <$> documentIdWithNgrams
474 (extractNgramsT $ withLang (Multi EN) documentsWithId)
475 documentsWithId
476
477 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
478
479 pure ()
480
481