]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
Merge branch '70-dev-searx-parser' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell...
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36 , saveDocNgramsWith
37
38 , getOrMkRoot
39 , getOrMk_RootWithCorpus
40 , TermType(..)
41 , DataOrigin(..)
42 , allDataOrigins
43
44 , do_api
45 , indexAllDocumentsWithPosTag
46 )
47 where
48
49 import Control.Lens ((^.), view, _Just, makeLenses)
50 import Data.Aeson.TH (deriveJSON)
51 import Data.Either
52 import Data.HashMap.Strict (HashMap)
53 import Data.Hashable (Hashable)
54 import Data.List (concat)
55 import Data.Map (Map, lookup)
56 import Data.Maybe (catMaybes)
57 import Data.Monoid
58 import Data.Swagger
59 import qualified Data.Text as T
60 import Data.Traversable (traverse)
61 import Data.Tuple.Extra (first, second)
62 import GHC.Generics (Generic)
63 import System.FilePath (FilePath)
64 import qualified Data.HashMap.Strict as HashMap
65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
66 import qualified Data.Map as Map
67
68 import Gargantext.Core (Lang(..), PosTagAlgo(..))
69 import Gargantext.Core.Ext.IMT (toSchoolName)
70 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
71 import Gargantext.Core.Flow.Types
72 import Gargantext.Core.Text
73 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Core.Text.List (buildNgramsLists)
76 import Gargantext.Core.Text.Terms
77 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
78 import Gargantext.Core.Types (POS(NP))
79 import Gargantext.Core.Types.Individu (User(..))
80 import Gargantext.Core.Types.Main
81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
82 import Gargantext.Database.Action.Flow.List
83 import Gargantext.Database.Action.Flow.Types
84 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
85 import Gargantext.Database.Action.Search (searchDocInDatabase)
86 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
87 import Gargantext.Database.Admin.Types.Hyperdata
88 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
89 import Gargantext.Database.Prelude
90 import Gargantext.Database.Query.Table.Ngrams
91 import Gargantext.Database.Query.Table.Node
92 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
93 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
94 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
95 import Gargantext.Database.Query.Table.NodeNodeNgrams2
96 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
97 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
98 import Gargantext.Database.Types
99 import Gargantext.Prelude
100 import Gargantext.Prelude.Crypto.Hash (Hash)
101 import qualified Gargantext.Core.Text.Corpus.API as API
102 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
103
104 ------------------------------------------------------------------------
105 -- Impots for upgrade function
106 import Gargantext.Database.Query.Tree.Root (getRootId)
107 import Gargantext.Database.Query.Tree (findNodesId)
108 import qualified Data.List as List
109 ------------------------------------------------------------------------
110 -- TODO use internal with API name (could be old data)
111 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
112 | ExternalOrigin { _do_api :: API.ExternalAPIs }
113 -- TODO Web
114 deriving (Generic, Eq)
115
116 makeLenses ''DataOrigin
117 deriveJSON (unPrefix "_do_") ''DataOrigin
118 instance ToSchema DataOrigin where
119 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
120
121 allDataOrigins :: [DataOrigin]
122 allDataOrigins = map InternalOrigin API.externalAPIs
123 <> map ExternalOrigin API.externalAPIs
124
125 ---------------
126 data DataText = DataOld ![NodeId]
127 | DataNew ![[HyperdataDocument]]
128
129 -- TODO use the split parameter in config file
130 getDataText :: FlowCmdM env err m
131 => DataOrigin
132 -> TermType Lang
133 -> API.Query
134 -> Maybe API.Limit
135 -> m DataText
136 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
137 <$> splitEvery 500
138 <$> API.get api (_tt_lang la) q li
139
140 getDataText (InternalOrigin _) _la q _li = do
141 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
142 (UserName userMaster)
143 (Left "")
144 (Nothing :: Maybe HyperdataCorpus)
145 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
146 pure $ DataOld ids
147
148 -------------------------------------------------------------------------------
149 flowDataText :: ( FlowCmdM env err m
150 )
151 => User
152 -> DataText
153 -> TermType Lang
154 -> CorpusId
155 -> m CorpusId
156 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
157 where
158 corpusType = (Nothing :: Maybe HyperdataCorpus)
159 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
160
161 ------------------------------------------------------------------------
162 -- TODO use proxy
163 flowAnnuaire :: (FlowCmdM env err m)
164 => User
165 -> Either CorpusName [CorpusId]
166 -> (TermType Lang)
167 -> FilePath
168 -> m AnnuaireId
169 flowAnnuaire u n l filePath = do
170 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
171 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
172
173 ------------------------------------------------------------------------
174 flowCorpusFile :: (FlowCmdM env err m)
175 => User
176 -> Either CorpusName [CorpusId]
177 -> Limit -- Limit the number of docs (for dev purpose)
178 -> TermType Lang -> FileFormat -> FilePath
179 -> m CorpusId
180 flowCorpusFile u n l la ff fp = do
181 eParsed <- liftBase $ parseFile ff fp
182 case eParsed of
183 Right parsed -> do
184 let docs = splitEvery 500 $ take l parsed
185 flowCorpus u n la (map (map toHyperdataDocument) docs)
186 Left e -> panic $ "Error: " <> (T.pack e)
187
188 ------------------------------------------------------------------------
189 -- | TODO improve the needed type to create/update a corpus
190 -- (For now, Either is enough)
191 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
192 => User
193 -> Either CorpusName [CorpusId]
194 -> TermType Lang
195 -> [[a]]
196 -> m CorpusId
197 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
198
199
200 flow :: ( FlowCmdM env err m
201 , FlowCorpus a
202 , MkCorpus c
203 )
204 => Maybe c
205 -> User
206 -> Either CorpusName [CorpusId]
207 -> TermType Lang
208 -> [[a]]
209 -> m CorpusId
210 flow c u cn la docs = do
211 -- TODO if public insertMasterDocs else insertUserDocs
212 ids <- traverse (insertMasterDocs c la) docs
213 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
214
215 ------------------------------------------------------------------------
216 flowCorpusUser :: ( FlowCmdM env err m
217 , MkCorpus c
218 )
219 => Lang
220 -> User
221 -> Either CorpusName [CorpusId]
222 -> Maybe c
223 -> [NodeId]
224 -> m CorpusId
225 flowCorpusUser l user corpusName ctype ids = do
226 -- User Flow
227 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
228 -- NodeTexts is first
229 _tId <- insertDefaultNode NodeTexts userCorpusId userId
230 -- printDebug "NodeTexts: " tId
231
232 -- NodeList is second
233 listId <- getOrMkList userCorpusId userId
234 -- _cooc <- insertDefaultNode NodeListCooc listId userId
235 -- TODO: check if present already, ignore
236 _ <- Doc.add userCorpusId ids
237
238 -- printDebug "Node Text Ids:" tId
239
240 -- User List Flow
241 (masterUserId, _masterRootId, masterCorpusId)
242 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
243
244 --let gp = (GroupParams l 2 3 (StopSize 3))
245 let gp = GroupWithPosTag l CoreNLP HashMap.empty
246 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
247
248 _userListId <- flowList_DbRepo listId ngs
249 _mastListId <- getOrMkList masterCorpusId masterUserId
250 -- _ <- insertOccsUpdates userCorpusId mastListId
251 -- printDebug "userListId" userListId
252 -- User Graph Flow
253 _ <- insertDefaultNode NodeDashboard userCorpusId userId
254 _ <- insertDefaultNode NodeGraph userCorpusId userId
255 --_ <- mkPhylo userCorpusId userId
256 -- Annuaire Flow
257 -- _ <- mkAnnuaire rootUserId userId
258 pure userCorpusId
259
260
261 insertMasterDocs :: ( FlowCmdM env err m
262 , FlowCorpus a
263 , MkCorpus c
264 )
265 => Maybe c
266 -> TermType Lang
267 -> [a]
268 -> m [DocId]
269 insertMasterDocs c lang hs = do
270 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
271 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
272 _ <- Doc.add masterCorpusId ids'
273 -- TODO
274 -- create a corpus with database name (CSV or PubMed)
275 -- add documents to the corpus (create node_node link)
276 -- this will enable global database monitoring
277
278 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
279 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
280 <- mapNodeIdNgrams
281 <$> documentIdWithNgrams
282 (extractNgramsT $ withLang lang documentsWithId)
283 documentsWithId
284
285 lId <- getOrMkList masterCorpusId masterUserId
286 _ <- saveDocNgramsWith lId mapNgramsDocs'
287
288 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
289 pure ids'
290
291 saveDocNgramsWith :: ( FlowCmdM env err m)
292 => ListId
293 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
294 -> m ()
295 saveDocNgramsWith lId mapNgramsDocs' = do
296 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
297 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
298
299 -- to be removed
300 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
301
302 -- new
303 mapCgramsId <- listInsertDb lId toNodeNgramsW'
304 $ map (first _ngramsTerms . second Map.keys)
305 $ HashMap.toList mapNgramsDocs
306
307 -- insertDocNgrams
308 _return <- insertNodeNodeNgrams2
309 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
310 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
311 <*> Just (fromIntegral w :: Double)
312 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
313 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
314 , (nId, w) <- Map.toList mapNodeIdWeight
315 ]
316 -- to be removed
317 _ <- insertDocNgrams lId indexedNgrams
318
319 pure ()
320
321
322 ------------------------------------------------------------------------
323 -- TODO Type NodeDocumentUnicised
324 insertDocs :: ( FlowCmdM env err m
325 -- , FlowCorpus a
326 , FlowInsertDB a
327 )
328 => UserId
329 -> CorpusId
330 -> [a]
331 -> m ([DocId], [Indexed NodeId a])
332 insertDocs uId cId hs = do
333 let docs = map addUniqId hs
334 newIds <- insertDb uId cId docs
335 -- printDebug "newIds" newIds
336 let
337 newIds' = map reId newIds
338 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
339 _ <- Doc.add cId newIds'
340 pure (newIds', documentsWithId)
341
342
343 ------------------------------------------------------------------------
344 viewUniqId' :: UniqId a
345 => a
346 -> (Hash, a)
347 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
348 where
349 err = panic "[ERROR] Database.Flow.toInsert"
350
351
352 toInserted :: [ReturnId]
353 -> Map Hash ReturnId
354 toInserted =
355 Map.fromList . map (\r -> (reUniqId r, r) )
356 . filter (\r -> reInserted r == True)
357
358 mergeData :: Map Hash ReturnId
359 -> Map Hash a
360 -> [Indexed NodeId a]
361 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
362 where
363 toDocumentWithId (sha,hpd) =
364 Indexed <$> fmap reId (lookup sha rs)
365 <*> Just hpd
366
367 ------------------------------------------------------------------------
368 ------------------------------------------------------------------------
369 ------------------------------------------------------------------------
370 documentIdWithNgrams :: HasNodeError err
371 => (a
372 -> Cmd err (HashMap b (Map NgramsType Int)))
373 -> [Indexed NodeId a]
374 -> Cmd err [DocumentIdWithNgrams a b]
375 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
376 where
377 toDocumentIdWithNgrams d = do
378 e <- f $ _unIndex d
379 pure $ DocumentIdWithNgrams d e
380
381
382 -- | TODO check optimization
383 mapNodeIdNgrams :: (Ord b, Hashable b)
384 => [DocumentIdWithNgrams a b]
385 -> HashMap b
386 (Map NgramsType
387 (Map NodeId Int)
388 )
389 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
390 where
391 f :: DocumentIdWithNgrams a b
392 -> HashMap b (Map NgramsType (Map NodeId Int))
393 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
394 where
395 nId = _index $ documentWithId d
396
397
398 ------------------------------------------------------------------------
399 instance ExtractNgramsT HyperdataContact
400 where
401 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
402 where
403 extract :: TermType Lang -> HyperdataContact
404 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
405 extract _l hc' = do
406 let authors = map text2ngrams
407 $ maybe ["Nothing"] (\a -> [a])
408 $ view (hc_who . _Just . cw_lastName) hc'
409
410 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
411
412
413 instance ExtractNgramsT HyperdataDocument
414 where
415 extractNgramsT :: TermType Lang
416 -> HyperdataDocument
417 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
418 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
419 where
420 extractNgramsT' :: TermType Lang
421 -> HyperdataDocument
422 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
423 extractNgramsT' lang' doc = do
424 let source = text2ngrams
425 $ maybe "Nothing" identity
426 $ _hd_source doc
427
428 institutes = map text2ngrams
429 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
430 $ _hd_institutes doc
431
432 authors = map text2ngrams
433 $ maybe ["Nothing"] (T.splitOn ", ")
434 $ _hd_authors doc
435
436 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
437 <$> concat
438 <$> liftBase (extractTerms lang' $ hasText doc)
439
440 pure $ HashMap.fromList
441 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
442 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
443 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
444 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
445
446 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
447 where
448 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
449
450 instance HasText a => HasText (Node a)
451 where
452 hasText (Node _ _ _ _ _ _ _ h) = hasText h
453
454
455
456 -- | TODO putelsewhere
457 -- | Upgrade function
458 -- Suppose all documents are English (this is the case actually)
459 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
460 indexAllDocumentsWithPosTag = do
461 rootId <- getRootId (UserName userMaster)
462 corpusIds <- findNodesId rootId [NodeCorpus]
463 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
464
465 _ <- mapM extractInsert (splitEvery 1000 docs)
466
467 pure ()
468
469 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
470 extractInsert docs = do
471 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
472
473 mapNgramsDocs' <- mapNodeIdNgrams
474 <$> documentIdWithNgrams
475 (extractNgramsT $ withLang (Multi EN) documentsWithId)
476 documentsWithId
477
478 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
479
480 pure ()
481
482