]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
Merge branch '104-dev-john-snow-nlp' of ssh://gitlab.iscpif.fr:20022/cgenie/haskell...
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( DataText(..)
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36 , saveDocNgramsWith
37
38 , getOrMkRoot
39 , getOrMk_RootWithCorpus
40 , TermType(..)
41 , DataOrigin(..)
42 , allDataOrigins
43
44 , do_api
45 , indexAllDocumentsWithPosTag
46 )
47 where
48
49 import Conduit
50 import Control.Lens ((^.), view, _Just, makeLenses)
51 import Data.Aeson.TH (deriveJSON)
52 import Data.Conduit.Internal (zipSources)
53 import Data.Either
54 import Data.HashMap.Strict (HashMap)
55 import Data.Hashable (Hashable)
56 import Data.List (concat)
57 import Data.Map (Map, lookup)
58 import Data.Maybe (catMaybes)
59 import Data.Monoid
60 import Data.Swagger
61 import qualified Data.Text as T
62 import Data.Traversable (traverse)
63 import Data.Tuple.Extra (first, second)
64 import GHC.Generics (Generic)
65 import Servant.Client (ClientError)
66 import System.FilePath (FilePath)
67 import qualified Data.HashMap.Strict as HashMap
68 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
69 import qualified Data.Map as Map
70
71 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
72 import Gargantext.Core (Lang(..), PosTagAlgo(..))
73 import Gargantext.Core.Ext.IMT (toSchoolName)
74 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
75 import Gargantext.Core.Flow.Types
76 import Gargantext.Core.Text
77 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
78 import Gargantext.Core.Text.List (buildNgramsLists)
79 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
80 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
81 import Gargantext.Core.Text.Terms
82 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
83 import Gargantext.Core.Types (POS(NP))
84 import Gargantext.Core.Types.Individu (User(..))
85 import Gargantext.Core.Types.Main
86 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
87 import Gargantext.Database.Action.Flow.List
88 import Gargantext.Database.Action.Flow.Types
89 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
90 import Gargantext.Database.Action.Search (searchDocInDatabase)
91 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
92 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
93 import Gargantext.Database.Admin.Types.Hyperdata
94 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
95 import Gargantext.Database.Prelude
96 import Gargantext.Database.Query.Table.ContextNodeNgrams2
97 import Gargantext.Database.Query.Table.Ngrams
98 import Gargantext.Database.Query.Table.Node
99 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
100 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
101 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
102 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
103 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
104 import Gargantext.Database.Types
105 import Gargantext.Prelude
106 import Gargantext.Prelude.Crypto.Hash (Hash)
107 import qualified Gargantext.Core.Text.Corpus.API as API
108 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
109 import qualified Prelude
110
111 ------------------------------------------------------------------------
112 -- Imports for upgrade function
113 import Gargantext.Database.Query.Tree.Root (getRootId)
114 import Gargantext.Database.Query.Tree (findNodesId)
115 import qualified Data.List as List
116 ------------------------------------------------------------------------
117 -- TODO use internal with API name (could be old data)
118 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
119 | ExternalOrigin { _do_api :: API.ExternalAPIs }
120 -- TODO Web
121 deriving (Generic, Eq)
122
123 makeLenses ''DataOrigin
124 deriveJSON (unPrefix "_do_") ''DataOrigin
125 instance ToSchema DataOrigin where
126 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
127
128 allDataOrigins :: [DataOrigin]
129 allDataOrigins = map InternalOrigin API.externalAPIs
130 <> map ExternalOrigin API.externalAPIs
131
132 ---------------
133 data DataText = DataOld ![NodeId]
134 | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
135 -- | DataNew ![[HyperdataDocument]]
136
137 -- TODO use the split parameter in config file
138 getDataText :: FlowCmdM env err m
139 => DataOrigin
140 -> TermType Lang
141 -> API.Query
142 -> Maybe API.Limit
143 -> m (Either ClientError DataText)
144 getDataText (ExternalOrigin api) la q li = liftBase $ do
145 eRes <- API.get api (_tt_lang la) q li
146 pure $ DataNew <$> eRes
147
148 getDataText (InternalOrigin _) _la q _li = do
149 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
150 (UserName userMaster)
151 (Left "")
152 (Nothing :: Maybe HyperdataCorpus)
153 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
154 pure $ Right $ DataOld ids
155
156 -------------------------------------------------------------------------------
157 flowDataText :: forall env err m.
158 ( FlowCmdM env err m
159 )
160 => User
161 -> DataText
162 -> TermType Lang
163 -> CorpusId
164 -> Maybe FlowSocialListWith
165 -> (JobLog -> m ())
166 -> m CorpusId
167 flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
168 where
169 corpusType = (Nothing :: Maybe HyperdataCorpus)
170 flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
171 flowCorpus u (Right [cid]) tt mfslw (mLen, (transPipe liftBase txtC)) logStatus
172
173 ------------------------------------------------------------------------
174 -- TODO use proxy
175 flowAnnuaire :: (FlowCmdM env err m)
176 => User
177 -> Either CorpusName [CorpusId]
178 -> (TermType Lang)
179 -> FilePath
180 -> (JobLog -> m ())
181 -> m AnnuaireId
182 flowAnnuaire u n l filePath logStatus = do
183 -- TODO Conduit for file
184 docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
185 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus
186
187 ------------------------------------------------------------------------
188 flowCorpusFile :: (FlowCmdM env err m)
189 => User
190 -> Either CorpusName [CorpusId]
191 -> Limit -- Limit the number of docs (for dev purpose)
192 -> TermType Lang
193 -> FileType
194 -> FileFormat
195 -> FilePath
196 -> Maybe FlowSocialListWith
197 -> (JobLog -> m ())
198 -> m CorpusId
199 flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
200 eParsed <- liftBase $ parseFile ft ff fp
201 case eParsed of
202 Right parsed -> do
203 flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
204 --let docs = splitEvery 500 $ take l parsed
205 --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
206 Left e -> panic $ "Error: " <> T.pack e
207
208 ------------------------------------------------------------------------
209 -- | TODO improve the needed type to create/update a corpus
210 -- (For now, Either is enough)
211 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
212 => User
213 -> Either CorpusName [CorpusId]
214 -> TermType Lang
215 -> Maybe FlowSocialListWith
216 -> (Maybe Integer, ConduitT () a m ())
217 -> (JobLog -> m ())
218 -> m CorpusId
219 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
220
221
222 flow :: forall env err m a c.
223 ( FlowCmdM env err m
224 , FlowCorpus a
225 , MkCorpus c
226 )
227 => Maybe c
228 -> User
229 -> Either CorpusName [CorpusId]
230 -> TermType Lang
231 -> Maybe FlowSocialListWith
232 -> (Maybe Integer, ConduitT () a m ())
233 -> (JobLog -> m ())
234 -> m CorpusId
235 flow c u cn la mfslw (mLength, docsC) logStatus = do
236 -- TODO if public insertMasterDocs else insertUserDocs
237 ids <- runConduit $
238 zipSources (yieldMany [1..]) docsC
239 .| mapMC insertDoc
240 .| sinkList
241 -- ids <- traverse (\(idx, doc) -> do
242 -- id <- insertMasterDocs c la doc
243 -- logStatus JobLog { _scst_succeeded = Just $ 1 + idx
244 -- , _scst_failed = Just 0
245 -- , _scst_remaining = Just $ length docs - idx
246 -- , _scst_events = Just []
247 -- }
248 -- pure id
249 -- ) (zip [1..] docs)
250 flowCorpusUser (la ^. tt_lang) u cn c ids mfslw
251
252 where
253 insertDoc :: (Integer, a) -> m NodeId
254 insertDoc (idx, doc) = do
255 id <- insertMasterDocs c la [doc]
256 case mLength of
257 Nothing -> pure ()
258 Just len -> do
259 logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + idx
260 , _scst_failed = Just 0
261 , _scst_remaining = Just $ fromIntegral $ len - idx
262 , _scst_events = Just []
263 }
264 pure $ Prelude.head id
265
266
267
268 ------------------------------------------------------------------------
269 flowCorpusUser :: ( FlowCmdM env err m
270 , MkCorpus c
271 )
272 => Lang
273 -> User
274 -> Either CorpusName [CorpusId]
275 -> Maybe c
276 -> [NodeId]
277 -> Maybe FlowSocialListWith
278 -> m CorpusId
279 flowCorpusUser l user corpusName ctype ids mfslw = do
280 -- User Flow
281 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
282 -- NodeTexts is first
283 _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
284 -- printDebug "NodeTexts: " tId
285
286 -- NodeList is second
287 listId <- getOrMkList userCorpusId userId
288 -- _cooc <- insertDefaultNode NodeListCooc listId userId
289 -- TODO: check if present already, ignore
290 _ <- Doc.add userCorpusId ids
291
292 -- printDebug "Node Text Ids:" tId
293
294 -- User List Flow
295 (masterUserId, _masterRootId, masterCorpusId)
296 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
297
298 --let gp = (GroupParams l 2 3 (StopSize 3))
299 -- Here the PosTagAlgo should be chosen according the Lang
300 let gp = GroupWithPosTag l CoreNLP HashMap.empty
301 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
302
303 -- printDebug "flowCorpusUser:ngs" ngs
304
305 _userListId <- flowList_DbRepo listId ngs
306 _mastListId <- getOrMkList masterCorpusId masterUserId
307 -- _ <- insertOccsUpdates userCorpusId mastListId
308 -- printDebug "userListId" userListId
309 -- User Graph Flow
310 _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
311 _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
312 --_ <- mkPhylo userCorpusId userId
313 -- Annuaire Flow
314 -- _ <- mkAnnuaire rootUserId userId
315 _ <- updateNgramsOccurrences userCorpusId (Just listId)
316
317 pure userCorpusId
318
319
320 insertMasterDocs :: ( FlowCmdM env err m
321 , FlowCorpus a
322 , MkCorpus c
323 )
324 => Maybe c
325 -> TermType Lang
326 -> [a]
327 -> m [DocId]
328 insertMasterDocs c lang hs = do
329 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
330 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
331 _ <- Doc.add masterCorpusId ids'
332 -- TODO
333 -- create a corpus with database name (CSV or PubMed)
334 -- add documents to the corpus (create node_node link)
335 -- this will enable global database monitoring
336
337 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
338 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
339 <- mapNodeIdNgrams
340 <$> documentIdWithNgrams
341 (extractNgramsT $ withLang lang documentsWithId)
342 documentsWithId
343
344 lId <- getOrMkList masterCorpusId masterUserId
345 _ <- saveDocNgramsWith lId mapNgramsDocs'
346
347 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
348 pure ids'
349
350 saveDocNgramsWith :: ( FlowCmdM env err m)
351 => ListId
352 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
353 -> m ()
354 saveDocNgramsWith lId mapNgramsDocs' = do
355 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
356 --printDebug "terms2id" terms2id
357
358 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
359
360 -- new
361 mapCgramsId <- listInsertDb lId toNodeNgramsW'
362 $ map (first _ngramsTerms . second Map.keys)
363 $ HashMap.toList mapNgramsDocs
364
365 --printDebug "saveDocNgramsWith" mapCgramsId
366 -- insertDocNgrams
367 _return <- insertContextNodeNgrams2
368 $ catMaybes [ ContextNodeNgrams2 <$> Just nId
369 <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
370 <*> Just (fromIntegral w :: Double)
371 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
372 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
373 , (nId, w) <- Map.toList mapNodeIdWeight
374 ]
375
376 -- to be removed
377 _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
378
379 pure ()
380
381
382 ------------------------------------------------------------------------
383 -- TODO Type NodeDocumentUnicised
384 insertDocs :: ( FlowCmdM env err m
385 -- , FlowCorpus a
386 , FlowInsertDB a
387 )
388 => UserId
389 -> CorpusId
390 -> [a]
391 -> m ([ContextId], [Indexed ContextId a])
392 insertDocs uId cId hs = do
393 let docs = map addUniqId hs
394 newIds <- insertDb uId cId docs
395 -- printDebug "newIds" newIds
396 let
397 newIds' = map reId newIds
398 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
399 _ <- Doc.add cId newIds'
400 pure (newIds', documentsWithId)
401
402
403 ------------------------------------------------------------------------
404 viewUniqId' :: UniqId a
405 => a
406 -> (Hash, a)
407 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
408 where
409 err = panic "[ERROR] Database.Flow.toInsert"
410
411
412 toInserted :: [ReturnId]
413 -> Map Hash ReturnId
414 toInserted =
415 Map.fromList . map (\r -> (reUniqId r, r) )
416 . filter (\r -> reInserted r == True)
417
418 mergeData :: Map Hash ReturnId
419 -> Map Hash a
420 -> [Indexed NodeId a]
421 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
422 where
423 toDocumentWithId (sha,hpd) =
424 Indexed <$> fmap reId (lookup sha rs)
425 <*> Just hpd
426
427 ------------------------------------------------------------------------
428 ------------------------------------------------------------------------
429 ------------------------------------------------------------------------
430 documentIdWithNgrams :: HasNodeError err
431 => (a
432 -> Cmd err (HashMap b (Map NgramsType Int)))
433 -> [Indexed NodeId a]
434 -> Cmd err [DocumentIdWithNgrams a b]
435 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
436 where
437 toDocumentIdWithNgrams d = do
438 e <- f $ _unIndex d
439 pure $ DocumentIdWithNgrams d e
440
441
442 -- | TODO check optimization
443 mapNodeIdNgrams :: (Ord b, Hashable b)
444 => [DocumentIdWithNgrams a b]
445 -> HashMap b
446 (Map NgramsType
447 (Map NodeId Int)
448 )
449 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
450 where
451 f :: DocumentIdWithNgrams a b
452 -> HashMap b (Map NgramsType (Map NodeId Int))
453 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
454 where
455 nId = _index $ documentWithId d
456
457
458 ------------------------------------------------------------------------
459 instance ExtractNgramsT HyperdataContact
460 where
461 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
462 where
463 extract :: TermType Lang -> HyperdataContact
464 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
465 extract _l hc' = do
466 let authors = map text2ngrams
467 $ maybe ["Nothing"] (\a -> [a])
468 $ view (hc_who . _Just . cw_lastName) hc'
469
470 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
471
472
473 instance ExtractNgramsT HyperdataDocument
474 where
475 extractNgramsT :: TermType Lang
476 -> HyperdataDocument
477 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
478 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
479 where
480 extractNgramsT' :: TermType Lang
481 -> HyperdataDocument
482 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
483 extractNgramsT' lang' doc = do
484 let source = text2ngrams
485 $ maybe "Nothing" identity
486 $ _hd_source doc
487
488 institutes = map text2ngrams
489 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
490 $ _hd_institutes doc
491
492 authors = map text2ngrams
493 $ maybe ["Nothing"] (T.splitOn ", ")
494 $ _hd_authors doc
495
496 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
497 <$> concat
498 <$> liftBase (extractTerms lang' $ hasText doc)
499
500 pure $ HashMap.fromList
501 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
502 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
503 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
504 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
505
506 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
507 where
508 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
509
510 instance HasText a => HasText (Node a)
511 where
512 hasText (Node _ _ _ _ _ _ _ h) = hasText h
513
514
515
516 -- | TODO putelsewhere
517 -- | Upgrade function
518 -- Suppose all documents are English (this is the case actually)
519 indexAllDocumentsWithPosTag :: FlowCmdM env err m
520 => m ()
521 indexAllDocumentsWithPosTag = do
522 rootId <- getRootId (UserName userMaster)
523 corpusIds <- findNodesId rootId [NodeCorpus]
524 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
525 _ <- mapM extractInsert (splitEvery 1000 docs)
526 pure ()
527
528 extractInsert :: FlowCmdM env err m
529 => [Node HyperdataDocument] -> m ()
530 extractInsert docs = do
531 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
532 mapNgramsDocs' <- mapNodeIdNgrams
533 <$> documentIdWithNgrams
534 (extractNgramsT $ withLang (Multi EN) documentsWithId)
535 documentsWithId
536 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
537 pure ()