{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId CanFillUserCorpus userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE ScopedTypeVariables     #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowAnnuaire
  , insertMasterDocs
  , saveDocNgramsWith

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  , indexAllDocumentsWithPosTag
  )
    where

import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)

------------------------------------------------------------------------
-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Root (getRootId)
import Gargantext.Database.Query.Tree (findNodesId)
import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal origin with the API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs
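
-- Note: 'allDataOrigins' pairs every external API with both an 'InternalOrigin'
-- and an 'ExternalOrigin', so the same list of sources can be offered for
-- searches over already-ingested data and for new external queries.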
---------------
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]

-- TODO use the split parameter in the config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
                                         <$> splitEvery 500
                                         <$> API.get api (_tt_lang la) q li

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids

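-- Note: for an 'ExternalOrigin', the query is sent to the external API and the
-- results come back as batches of 500 new documents ('DataNew'); for an
-- 'InternalOrigin', the (stemmed) query is searched in the master corpus and
-- only the matching node ids are returned ('DataOld').
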
-------------------------------------------------------------------------------
flowDataText :: ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> (JobLog -> m ())
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus

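-- A minimal usage sketch (hypothetical values; 'API.PubMed' stands for
-- whichever 'API.ExternalAPIs' constructor is configured, and 'query' is the
-- search text):
--
--   txt <- getDataText (ExternalOrigin API.PubMed) (Multi EN) query (Just 100)
--   _   <- flowDataText user txt (Multi EN) corpusId Nothing logStatus
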
------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> (JobLog -> m ())
             -> m AnnuaireId
flowAnnuaire u n l filePath logStatus = do
  docs <- liftBase $ ((splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus

------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purposes)
               -> TermType Lang -> FileFormat -> FilePath
               -> Maybe FlowSocialListWith
               -> (JobLog -> m ())
               -> m CorpusId
flowCorpusFile u n l la ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ff fp
  case eParsed of
    Right parsed -> do
      let docs = splitEvery 500 $ take l parsed
      flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
    Left e       -> panic $ "Error: " <> T.pack e

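-- A minimal usage sketch (hypothetical values; 'csvFormat' is a placeholder
-- for whichever 'FileFormat' constructor matches the input file):
--
--   cid <- flowCorpusFile (UserName "alice") (Left "My corpus") 10000
--                         (Multi EN) csvFormat "/path/to/docs.csv"
--                         Nothing logStatus
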
------------------------------------------------------------------------
-- | TODO improve the type needed to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> [[a]]
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


flow :: ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> [[a]]
     -> (JobLog -> m ())
     -> m CorpusId
flow c u cn la mfslw docs logStatus = do
  -- TODO if public then insertMasterDocs else insertUserDocs
  ids <- traverse (\(idx, doc) -> do
                      id <- insertMasterDocs c la doc
                      logStatus JobLog { _scst_succeeded = Just $ 1 + idx
                                       , _scst_failed    = Just 0
                                       , _scst_remaining = Just $ length docs - idx
                                       , _scst_events    = Just []
                                       }
                      pure id
                  ) (zip [1..] docs)
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw

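-- Note: 'flow' inserts each batch of documents into the master corpus with
-- 'insertMasterDocs', reporting progress through 'logStatus' after every
-- batch, then passes all collected document ids to 'flowCorpusUser' for the
-- user-side part of the flow.
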
------------------------------------------------------------------------
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids mfslw = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if already present and, if so, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
  let gp = GroupWithPosTag l CoreNLP HashMap.empty
  ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp

  -- printDebug "flowCorpusUser:ngs" ngs

  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph userCorpusId userId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  pure userCorpusId

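-- Note: besides the user corpus node itself, 'flowCorpusUser' creates a
-- NodeTexts node, the corpus term list (filled from 'buildNgramsLists',
-- grouping with CoreNLP POS tags), a NodeDashboard and a NodeGraph, and links
-- the given documents to the corpus.
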
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
    <- mapNodeIdNgrams
       <$> documentIdWithNgrams
           (extractNgramsT $ withLang lang documentsWithId)
           documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'

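-- Note: 'insertMasterDocs' inserts the documents under the master corpus,
-- extracts their ngrams (see the 'ExtractNgramsT' instances below) and stores
-- the ngrams/document associations with 'saveDocNgramsWith'; the returned ids
-- are those of the freshly inserted documents.
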
saveDocNgramsWith :: (FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ HashMap.toList mapNgramsDocs

  -- printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  _return <- insertContextNodeNgrams2
           $ catMaybes [ ContextNodeNgrams2 <$> Just nId
                                            <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                            <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()

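-- Note: each 'ContextNodeNgrams2' row built above links one context (document)
-- node to one ngrams term of a given 'NgramsType', with its occurrence count
-- stored as a Double weight.
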
------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)

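-- Note: 'insertDocs' hashes each document ('addUniqId'), inserts the batch,
-- links the new contexts to the corpus, and returns both the new context ids
-- and the documents re-indexed by those ids; only documents that were actually
-- inserted are kept (see 'toInserted' / 'mergeData' below).
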
------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.toInsert"

toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map    (\r -> (reUniqId r, r))
               . filter (\r -> reInserted r == True)

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      Indexed <$> fmap reId (lookup sha rs)
              <*> Just hpd

------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => (a
                         -> Cmd err (HashMap b (Map NgramsType Int)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e

-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                           (Map NgramsType
                                (Map NodeId Int)
                           )
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
  where
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId Int))
    f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
      where
        nId = _index $ documentWithId d

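-- A schematic illustration (hypothetical values): if node 1 and node 2 each
-- contain the same term once as NgramsTerms, the per-document maps are merged
-- with (+) per (term, type, node) key, yielding
--   term -> fromList [(NgramsTerms, fromList [(1, 1), (2, 1)])]
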
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]

instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . T.splitOn ", ")
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (T.splitOn ", ")
                      $ _hd_authors doc

          terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
                 <$> concat
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $  [(SimpleNgrams source,   Map.singleton Sources 1)                        ]
               <> [(SimpleNgrams i',       Map.singleton Institutes 1)  | i' <- institutes ]
               <> [(SimpleNgrams a',       Map.singleton Authors 1)     | a' <- authors    ]
               <> [(EnrichedNgrams t',     Map.singleton NgramsTerms 1) | t' <- terms'     ]

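-- Note: for a document, the source, institutes and authors are indexed as
-- 'SimpleNgrams', while terms extracted from the document text are POS-tagged
-- with CoreNLP and indexed as 'EnrichedNgrams'; missing metadata fields fall
-- back to the literal "Nothing" string.
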
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h

instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h


-- | TODO put elsewhere
-- | Upgrade function
-- Assumes all documents are in English (which is currently the case)
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  rootId    <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs      <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  _ <- mapM extractInsert (splitEvery 1000 docs)
  pure ()

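-- Note: the upgrade walks every corpus under the master user's root, collects
-- the documents and re-indexes them in batches of 1000 through 'extractInsert',
-- which runs the extraction pipeline with a Multi EN language setting.
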
extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
  mapNgramsDocs' <- mapNodeIdNgrams
                <$> documentIdWithNgrams
                    (extractNgramsT $ withLang (Multi EN) documentsWithId)
                    documentsWithId
  _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  pure ()