{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE ScopedTypeVariables     #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowAnnuaire
  , insertMasterDocs
  , saveDocNgramsWith

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  , indexAllDocumentsWithPosTag
  )
    where

import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)

------------------------------------------------------------------------
-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Root (getRootId)
import Gargantext.Database.Query.Tree (findNodesId)
import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs

---------------
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]

-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew . splitEvery 500
                                                  <$> API.get api (_tt_lang la) q li

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids

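-- A usage sketch (not from the original module; 'API.PubMed' stands in
-- for whichever 'API.ExternalAPIs' constructor your build provides):
--
-- > -- Fetch up to 100 external documents, split into batches of 500:
-- > new <- getDataText (ExternalOrigin API.PubMed) (Multi EN) "bees" (Just 100)
-- > -- Or reuse documents already indexed in the master corpus:
-- > old <- getDataText (InternalOrigin API.PubMed) (Multi EN) "bees" Nothing
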
-------------------------------------------------------------------------------
flowDataText :: ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> (JobLog -> m ())
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus

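-- Hypothetical continuation of the sketch above: flow the fetched data
-- into an existing corpus 'cid' owned by user "alice".
--
-- > txt  <- getDataText (ExternalOrigin API.PubMed) (Multi EN) "bees" (Just 100)
-- > cid' <- flowDataText (UserName "alice") txt (Multi EN) cid Nothing logStatus
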
------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> (JobLog -> m ())
             -> m AnnuaireId
flowAnnuaire u n l filePath logStatus = do
  docs <- liftBase (splitEvery 500 <$> readFile_Annuaire filePath :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus

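-- Usage sketch (hypothetical path; 'readFile_Annuaire' comes from
-- Gargantext.Core.Ext.IMTUser and expects the IMT annuaire format):
--
-- > aId <- flowAnnuaire (UserName "alice") (Left "Annuaire") (Multi EN)
-- >                     "/tmp/annuaire.csv" logStatus
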
------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purposes)
               -> TermType Lang
               -> FileFormat
               -> FilePath
               -> Maybe FlowSocialListWith
               -> (JobLog -> m ())
               -> m CorpusId
flowCorpusFile u n l la ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ff fp
  case eParsed of
    Right parsed -> do
      let docs = splitEvery 500 $ take l parsed
      flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
    Left e       -> panic $ "Error: " <> T.pack e

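-- Usage sketch, assuming 'CsvGargV3' is among the 'FileFormat'
-- constructors exposed by Gargantext.Core.Text.Corpus.Parsers:
--
-- > cid <- flowCorpusFile (UserName "alice") (Left "MyCorpus") 10000
-- >                       (Multi EN) CsvGargV3 "/tmp/docs.csv" Nothing logStatus
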
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> [[a]]
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


flow :: ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> [[a]]
     -> (JobLog -> m ())
     -> m CorpusId
flow c u cn la mfslw docs logStatus = do
  -- TODO if public insertMasterDocs else insertUserDocs
  ids <- traverse (\(idx, doc) -> do
                      id <- insertMasterDocs c la doc
                      logStatus JobLog { _scst_succeeded = Just idx
                                       , _scst_failed    = Just 0
                                       , _scst_remaining = Just $ length docs - idx
                                       , _scst_events    = Just []
                                       }
                      pure id
                  ) (zip [1..] docs)
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw

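-- Usage sketch (hypothetical user and corpus names): create a new corpus
-- from batches of documents already in memory.
--
-- > cid <- flow (Nothing :: Maybe HyperdataCorpus) (UserName "alice")
-- >             (Left "MyCorpus") (Multi EN) Nothing [batch1, batch2] logStatus
--
-- With n batches, the logger sees (succeeded, remaining) move from
-- (1, n-1) down to (n, 0): succeeded + remaining == n at every step.
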

------------------------------------------------------------------------
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids mfslw = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  let gp = GroupWithPosTag l CoreNLP HashMap.empty
  ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp

  -- printDebug "flowCorpusUser:ngs" ngs

  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire  rootUserId userId
  _ <- updateNgramsOccurrences userCorpusId (Just listId)

  pure userCorpusId

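-- Node hierarchy created (or reused) by 'flowCorpusUser', as a sketch:
--
--   user root
--   └── corpus (userCorpusId)
--       ├── NodeTexts
--       ├── NodeList  (listId, populated via flowList_DbRepo)
--       ├── NodeDashboard
--       └── NodeGraph
--
-- The master corpus gets its own root and list via getOrMk_RootWithCorpus
-- and getOrMkList.
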

insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
                <- mapNodeIdNgrams
               <$> documentIdWithNgrams
                     (extractNgramsT $ withLang lang documentsWithId)
                     documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  _   <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'

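-- Shape of the intermediate map, schematically (constructors shown with
-- raw strings for readability; they actually wrap Ngrams/enriched terms):
--
--   EnrichedNgrams "ant" -> { NgramsTerms -> { doc 7 -> 2, doc 9 -> 1 } }
--
-- i.e. extracted term -> ngrams type -> document (context) id -> count.
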
saveDocNgramsWith :: (FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  printDebug "terms2id" terms2id

  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ HashMap.toList mapNgramsDocs

  printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  _return <- insertContextNodeNgrams2
           $ catMaybes [ ContextNodeNgrams2 <$> Just nId
                                            <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                            <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()

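-- The comprehension above flattens the nested map into one row per
-- (term, type, context) triple. Schematically,
--
--   "ant" -> { NgramsTerms -> { ctx 7 -> 2, ctx 9 -> 1 } }
--
-- yields two ContextNodeNgrams2 rows with weights 2.0 and 1.0; rows whose
-- ngrams id cannot be resolved by getCgramsId are dropped by catMaybes.
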

------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  -- printDebug "newIds" newIds
  let
    newIds'         = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)

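-- Note: 'insertDocs' deduplicates by content hash. 'addUniqId' stamps each
-- document with a hash, 'insertDb' reports per row (via ReturnId) whether
-- it was actually inserted, and 'mergeData' below joins both sides back
-- together on that hash.
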

------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.viewUniqId'"


toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      Indexed <$> fmap reId (lookup sha rs)
              <*> Just hpd

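-- Worked example of the hash join above:
--
--   rs   = { "h1" -> ReturnId { reId = 42, ... } }
--   docs = { "h1" -> docA, "h2" -> docB }
--
--   mergeData rs docs == [Indexed 42 docA]
--
-- docB is dropped because no ReturnId carries its hash.
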
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => (a -> Cmd err (HashMap b (Map NgramsType Int)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e


-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b (Map NgramsType (Map NodeId Int))
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
  where
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId Int))
    f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
      where
        nId = _index $ documentWithId d

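-- Worked example: if documents 1 and 2 both contain the term "ant" as
-- NgramsTerms, the per-document singletons
--
--   { "ant" -> { NgramsTerms -> { 1 -> 2 } } }
--   { "ant" -> { NgramsTerms -> { 2 -> 1 } } }
--
-- merge into { "ant" -> { NgramsTerms -> { 1 -> 2, 2 -> 1 } } }; the (+)
-- only sums counts when the same (term, type, document) key repeats.
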

------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors]


instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . T.splitOn ", ")
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (T.splitOn ", ")
                      $ _hd_authors doc

          terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
                  . concat
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $  [(SimpleNgrams source, Map.singleton Sources 1)]
               <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes]
               <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors]
               <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms']

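-- Schematic result for a document with source "HAL", institute "IMT",
-- author "Doe, J." and the single extracted term "ant" (constructors
-- shown with raw strings; they actually wrap Ngrams/enriched terms):
--
--   { SimpleNgrams   "HAL"     -> { Sources     -> 1 }
--   , SimpleNgrams   "IMT"     -> { Institutes  -> 1 }
--   , SimpleNgrams   "Doe, J." -> { Authors     -> 1 }
--   , EnrichedNgrams "ant"     -> { NgramsTerms -> 1 } }
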
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h

instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h


-- | TODO put elsewhere
-- | Upgrade function
-- Assumes all documents are in English (currently the case)
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  rootId    <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs      <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  _         <- mapM extractInsert (splitEvery 1000 docs)
  pure ()

extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
  mapNgramsDocs' <- mapNodeIdNgrams
                <$> documentIdWithNgrams
                      (extractNgramsT $ withLang (Multi EN) documentsWithId)
                      documentsWithId
  _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  pure ()