]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[TOOLING] sql queries to remove unused documents and ngrams from database. Usage...
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( DataText(..)
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36 , saveDocNgramsWith
37
38 , getOrMkRoot
39 , getOrMk_RootWithCorpus
40 , TermType(..)
41 , DataOrigin(..)
42 , allDataOrigins
43
44 , do_api
45 , indexAllDocumentsWithPosTag
46 )
47 where
48
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
51 import Data.Either
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
57 import Data.Monoid
58 import Data.Swagger
59 import qualified Data.Text as T
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
67
68 import Gargantext.Core (Lang(..), PosTagAlgo(..))
69 import Gargantext.Core.Ext.IMT (toSchoolName)
70 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
71 import Gargantext.Core.Flow.Types
72 import Gargantext.Core.Text
73 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Core.Text.List (buildNgramsLists)
76 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
77 import Gargantext.Core.Text.Terms
78 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
79 import Gargantext.Core.Types (POS(NP))
80 import Gargantext.Core.Types.Individu (User(..))
81 import Gargantext.Core.Types.Main
82 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
83 import Gargantext.Database.Action.Flow.List
84 import Gargantext.Database.Action.Flow.Types
85 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
86 import Gargantext.Database.Action.Search (searchDocInDatabase)
87 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
88 import Gargantext.Database.Admin.Types.Hyperdata
89 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
90 import Gargantext.Database.Prelude
91 import Gargantext.Database.Query.Table.Ngrams
92 import Gargantext.Database.Query.Table.Node
93 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
94 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
95 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
96 import Gargantext.Database.Query.Table.NodeNodeNgrams2
97 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
98 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
99 import Gargantext.Database.Types
100 import Gargantext.Prelude
101 import Gargantext.Prelude.Crypto.Hash (Hash)
102 import qualified Gargantext.Core.Text.Corpus.API as API
103 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
104
105 ------------------------------------------------------------------------
106 -- Imports for upgrade function
107 import Gargantext.Database.Query.Tree.Root (getRootId)
108 import Gargantext.Database.Query.Tree (findNodesId)
109 import qualified Data.List as List
110 ------------------------------------------------------------------------
111 -- TODO use internal with API name (could be old data)
112 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
113 | ExternalOrigin { _do_api :: API.ExternalAPIs }
114 -- TODO Web
115 deriving (Generic, Eq)
116
117 makeLenses ''DataOrigin
118 deriveJSON (unPrefix "_do_") ''DataOrigin
119 instance ToSchema DataOrigin where
120 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
121
122 allDataOrigins :: [DataOrigin]
123 allDataOrigins = map InternalOrigin API.externalAPIs
124 <> map ExternalOrigin API.externalAPIs
125
126 ---------------
127 data DataText = DataOld ![NodeId]
128 | DataNew ![[HyperdataDocument]]
129
130 -- TODO use the split parameter in config file
131 getDataText :: FlowCmdM env err m
132 => DataOrigin
133 -> TermType Lang
134 -> API.Query
135 -> Maybe API.Limit
136 -> m DataText
137 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
138 <$> splitEvery 500
139 <$> API.get api (_tt_lang la) q li
140
141 getDataText (InternalOrigin _) _la q _li = do
142 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
143 (UserName userMaster)
144 (Left "")
145 (Nothing :: Maybe HyperdataCorpus)
146 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
147 pure $ DataOld ids
148
149 -------------------------------------------------------------------------------
150 flowDataText :: ( FlowCmdM env err m
151 )
152 => User
153 -> DataText
154 -> TermType Lang
155 -> CorpusId
156 -> Maybe FlowSocialListWith
157 -> m CorpusId
158 flowDataText u (DataOld ids) tt cid mfslw = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
159 where
160 corpusType = (Nothing :: Maybe HyperdataCorpus)
161 flowDataText u (DataNew txt) tt cid mfslw = flowCorpus u (Right [cid]) tt mfslw txt
162
163 ------------------------------------------------------------------------
164 -- TODO use proxy
165 flowAnnuaire :: (FlowCmdM env err m)
166 => User
167 -> Either CorpusName [CorpusId]
168 -> (TermType Lang)
169 -> FilePath
170 -> m AnnuaireId
171 flowAnnuaire u n l filePath = do
172 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
173 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs
174
175 ------------------------------------------------------------------------
176 flowCorpusFile :: (FlowCmdM env err m)
177 => User
178 -> Either CorpusName [CorpusId]
179 -> Limit -- Limit the number of docs (for dev purpose)
180 -> TermType Lang -> FileFormat -> FilePath
181 -> Maybe FlowSocialListWith
182 -> m CorpusId
183 flowCorpusFile u n l la ff fp mfslw = do
184 eParsed <- liftBase $ parseFile ff fp
185 case eParsed of
186 Right parsed -> do
187 let docs = splitEvery 500 $ take l parsed
188 flowCorpus u n la mfslw (map (map toHyperdataDocument) docs)
189 Left e -> panic $ "Error: " <> (T.pack e)
190
191 ------------------------------------------------------------------------
192 -- | TODO improve the needed type to create/update a corpus
193 -- (For now, Either is enough)
194 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
195 => User
196 -> Either CorpusName [CorpusId]
197 -> TermType Lang
198 -> Maybe FlowSocialListWith
199 -> [[a]]
200 -> m CorpusId
201 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
202
203
204 flow :: ( FlowCmdM env err m
205 , FlowCorpus a
206 , MkCorpus c
207 )
208 => Maybe c
209 -> User
210 -> Either CorpusName [CorpusId]
211 -> TermType Lang
212 -> Maybe FlowSocialListWith
213 -> [[a]]
214 -> m CorpusId
215 flow c u cn la mfslw docs = do
216 -- TODO if public insertMasterDocs else insertUserDocs
217 ids <- traverse (insertMasterDocs c la) docs
218 flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
219
220 ------------------------------------------------------------------------
221 flowCorpusUser :: ( FlowCmdM env err m
222 , MkCorpus c
223 )
224 => Lang
225 -> User
226 -> Either CorpusName [CorpusId]
227 -> Maybe c
228 -> [NodeId]
229 -> Maybe FlowSocialListWith
230 -> m CorpusId
231 flowCorpusUser l user corpusName ctype ids mfslw = do
232 -- User Flow
233 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
234 -- NodeTexts is first
235 _tId <- insertDefaultNode NodeTexts userCorpusId userId
236 -- printDebug "NodeTexts: " tId
237
238 -- NodeList is second
239 listId <- getOrMkList userCorpusId userId
240 -- _cooc <- insertDefaultNode NodeListCooc listId userId
241 -- TODO: check if present already, ignore
242 _ <- Doc.add userCorpusId ids
243
244 -- printDebug "Node Text Ids:" tId
245
246 -- User List Flow
247 (masterUserId, _masterRootId, masterCorpusId)
248 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
249
250 --let gp = (GroupParams l 2 3 (StopSize 3))
251 let gp = GroupWithPosTag l CoreNLP HashMap.empty
252 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
253
254 _userListId <- flowList_DbRepo listId ngs
255 _mastListId <- getOrMkList masterCorpusId masterUserId
256 -- _ <- insertOccsUpdates userCorpusId mastListId
257 -- printDebug "userListId" userListId
258 -- User Graph Flow
259 _ <- insertDefaultNode NodeDashboard userCorpusId userId
260 _ <- insertDefaultNode NodeGraph userCorpusId userId
261 --_ <- mkPhylo userCorpusId userId
262 -- Annuaire Flow
263 -- _ <- mkAnnuaire rootUserId userId
264 pure userCorpusId
265
266
267 insertMasterDocs :: ( FlowCmdM env err m
268 , FlowCorpus a
269 , MkCorpus c
270 )
271 => Maybe c
272 -> TermType Lang
273 -> [a]
274 -> m [DocId]
275 insertMasterDocs c lang hs = do
276 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
277 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
278 _ <- Doc.add masterCorpusId ids'
279 -- TODO
280 -- create a corpus with database name (CSV or PubMed)
281 -- add documents to the corpus (create node_node link)
282 -- this will enable global database monitoring
283
284 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
285 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
286 <- mapNodeIdNgrams
287 <$> documentIdWithNgrams
288 (extractNgramsT $ withLang lang documentsWithId)
289 documentsWithId
290
291 lId <- getOrMkList masterCorpusId masterUserId
292 _ <- saveDocNgramsWith lId mapNgramsDocs'
293
294 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
295 pure ids'
296
297 saveDocNgramsWith :: ( FlowCmdM env err m)
298 => ListId
299 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
300 -> m ()
301 saveDocNgramsWith lId mapNgramsDocs' = do
302 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
303 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
304
305 -- to be removed
306 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
307
308 -- new
309 mapCgramsId <- listInsertDb lId toNodeNgramsW'
310 $ map (first _ngramsTerms . second Map.keys)
311 $ HashMap.toList mapNgramsDocs
312
313 -- insertDocNgrams
314 _return <- insertNodeNodeNgrams2
315 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
316 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
317 <*> Just (fromIntegral w :: Double)
318 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
319 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
320 , (nId, w) <- Map.toList mapNodeIdWeight
321 ]
322 -- to be removed
323 _ <- insertDocNgrams lId indexedNgrams
324
325 pure ()
326
327
328 ------------------------------------------------------------------------
329 -- TODO Type NodeDocumentUnicised
330 insertDocs :: ( FlowCmdM env err m
331 -- , FlowCorpus a
332 , FlowInsertDB a
333 )
334 => UserId
335 -> CorpusId
336 -> [a]
337 -> m ([DocId], [Indexed NodeId a])
338 insertDocs uId cId hs = do
339 let docs = map addUniqId hs
340 newIds <- insertDb uId cId docs
341 -- printDebug "newIds" newIds
342 let
343 newIds' = map reId newIds
344 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
345 _ <- Doc.add cId newIds'
346 pure (newIds', documentsWithId)
347
348
349 ------------------------------------------------------------------------
350 viewUniqId' :: UniqId a
351 => a
352 -> (Hash, a)
353 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
354 where
355 err = panic "[ERROR] Database.Flow.toInsert"
356
357
358 toInserted :: [ReturnId]
359 -> Map Hash ReturnId
360 toInserted =
361 Map.fromList . map (\r -> (reUniqId r, r) )
362 . filter (\r -> reInserted r == True)
363
364 mergeData :: Map Hash ReturnId
365 -> Map Hash a
366 -> [Indexed NodeId a]
367 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
368 where
369 toDocumentWithId (sha,hpd) =
370 Indexed <$> fmap reId (lookup sha rs)
371 <*> Just hpd
372
373 ------------------------------------------------------------------------
374 ------------------------------------------------------------------------
375 ------------------------------------------------------------------------
376 documentIdWithNgrams :: HasNodeError err
377 => (a
378 -> Cmd err (HashMap b (Map NgramsType Int)))
379 -> [Indexed NodeId a]
380 -> Cmd err [DocumentIdWithNgrams a b]
381 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
382 where
383 toDocumentIdWithNgrams d = do
384 e <- f $ _unIndex d
385 pure $ DocumentIdWithNgrams d e
386
387
388 -- | TODO check optimization
389 mapNodeIdNgrams :: (Ord b, Hashable b)
390 => [DocumentIdWithNgrams a b]
391 -> HashMap b
392 (Map NgramsType
393 (Map NodeId Int)
394 )
395 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
396 where
397 f :: DocumentIdWithNgrams a b
398 -> HashMap b (Map NgramsType (Map NodeId Int))
399 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
400 where
401 nId = _index $ documentWithId d
402
403
404 ------------------------------------------------------------------------
405 instance ExtractNgramsT HyperdataContact
406 where
407 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
408 where
409 extract :: TermType Lang -> HyperdataContact
410 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
411 extract _l hc' = do
412 let authors = map text2ngrams
413 $ maybe ["Nothing"] (\a -> [a])
414 $ view (hc_who . _Just . cw_lastName) hc'
415
416 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
417
418
419 instance ExtractNgramsT HyperdataDocument
420 where
421 extractNgramsT :: TermType Lang
422 -> HyperdataDocument
423 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
424 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
425 where
426 extractNgramsT' :: TermType Lang
427 -> HyperdataDocument
428 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
429 extractNgramsT' lang' doc = do
430 let source = text2ngrams
431 $ maybe "Nothing" identity
432 $ _hd_source doc
433
434 institutes = map text2ngrams
435 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
436 $ _hd_institutes doc
437
438 authors = map text2ngrams
439 $ maybe ["Nothing"] (T.splitOn ", ")
440 $ _hd_authors doc
441
442 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
443 <$> concat
444 <$> liftBase (extractTerms lang' $ hasText doc)
445
446 pure $ HashMap.fromList
447 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
448 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
449 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
450 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
451
452 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
453 where
454 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
455
456 instance HasText a => HasText (Node a)
457 where
458 hasText (Node _ _ _ _ _ _ _ h) = hasText h
459
460
461
462 -- | TODO putelsewhere
463 -- | Upgrade function
464 -- Suppose all documents are English (this is the case actually)
465 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
466 indexAllDocumentsWithPosTag = do
467 rootId <- getRootId (UserName userMaster)
468 corpusIds <- findNodesId rootId [NodeCorpus]
469 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
470
471 _ <- mapM extractInsert (splitEvery 1000 docs)
472
473 pure ()
474
475 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
476 extractInsert docs = do
477 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
478
479 mapNgramsDocs' <- mapNodeIdNgrams
480 <$> documentIdWithNgrams
481 (extractNgramsT $ withLang (Multi EN) documentsWithId)
482 documentsWithId
483
484 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
485
486 pure ()
487
488