]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
Merge branch 'dev' into dev-corpora-from-write-nodes
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( getDataText
28 , flowDataText
29 , flow
30
31 , flowCorpusFile
32 , flowCorpus
33 , flowAnnuaire
34 , insertMasterDocs
35 , saveDocNgramsWith
36
37 , getOrMkRoot
38 , getOrMk_RootWithCorpus
39 , TermType(..)
40 , DataOrigin(..)
41 , allDataOrigins
42
43 , do_api
44 , indexAllDocumentsWithPosTag
45 )
46 where
47
48 import Control.Lens ((^.), view, _Just, makeLenses)
49 import Data.Aeson.TH (deriveJSON)
50 import Data.Either
51 import Data.HashMap.Strict (HashMap)
52 import Data.Hashable (Hashable)
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (catMaybes)
56 import Data.Monoid
57 import Data.Swagger
58 import qualified Data.Text as T
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import GHC.Generics (Generic)
62 import System.FilePath (FilePath)
63 import qualified Data.HashMap.Strict as HashMap
64 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
65 import qualified Data.Map as Map
66
67 import Gargantext.Core (Lang(..), PosTagAlgo(..))
68 import Gargantext.Core.Ext.IMT (toSchoolName)
69 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
70 import Gargantext.Core.Flow.Types
71 import Gargantext.Core.Text
72 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
73 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
74 import Gargantext.Core.Text.List (buildNgramsLists)
75 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
76 import Gargantext.Core.Text.Terms
77 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
78 import Gargantext.Core.Types (POS(NP))
79 import Gargantext.Core.Types.Individu (User(..))
80 import Gargantext.Core.Types.Main
81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
82 import Gargantext.Database.Action.Flow.List
83 import Gargantext.Database.Action.Flow.Types
84 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
85 import Gargantext.Database.Action.Search (searchDocInDatabase)
86 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
87 import Gargantext.Database.Admin.Types.Hyperdata
88 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
89 import Gargantext.Database.Prelude
90 import Gargantext.Database.Query.Table.Ngrams
91 import Gargantext.Database.Query.Table.Node
92 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
93 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
94 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
95 import Gargantext.Database.Query.Table.NodeNodeNgrams2
96 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
97 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
98 import Gargantext.Database.Types
99 import Gargantext.Prelude
100 import Gargantext.Prelude.Crypto.Hash (Hash)
101 import qualified Gargantext.Core.Text.Corpus.API as API
102 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
103
104 ------------------------------------------------------------------------
105 -- Impots for upgrade function
106 import Gargantext.Database.Query.Tree.Root (getRootId)
107 import Gargantext.Database.Query.Tree (findNodesId)
108 import qualified Data.List as List
109 ------------------------------------------------------------------------
110 -- TODO use internal with API name (could be old data)
111 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
112 | ExternalOrigin { _do_api :: API.ExternalAPIs }
113 -- TODO Web
114 deriving (Generic, Eq)
115
116 makeLenses ''DataOrigin
117 deriveJSON (unPrefix "_do_") ''DataOrigin
118 instance ToSchema DataOrigin where
119 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
120
121 allDataOrigins :: [DataOrigin]
122 allDataOrigins = map InternalOrigin API.externalAPIs
123 <> map ExternalOrigin API.externalAPIs
124
125 ---------------
126 data DataText = DataOld ![NodeId]
127 | DataNew ![[HyperdataDocument]]
128
129 -- TODO use the split parameter in config file
130 getDataText :: FlowCmdM env err m
131 => DataOrigin
132 -> TermType Lang
133 -> API.Query
134 -> Maybe API.Limit
135 -> m DataText
136 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
137 <$> splitEvery 500
138 <$> API.get api (_tt_lang la) q li
139
140 getDataText (InternalOrigin _) _la q _li = do
141 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
142 (UserName userMaster)
143 (Left "")
144 (Nothing :: Maybe HyperdataCorpus)
145 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
146 pure $ DataOld ids
147
148 -------------------------------------------------------------------------------
149 flowDataText :: ( FlowCmdM env err m
150 )
151 => User
152 -> DataText
153 -> TermType Lang
154 -> CorpusId
155 -> Maybe FlowSocialListWith
156 -> m CorpusId
157 flowDataText u (DataOld ids) tt cid mfslw = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
158 where
159 corpusType = (Nothing :: Maybe HyperdataCorpus)
160 flowDataText u (DataNew txt) tt cid mfslw = flowCorpus u (Right [cid]) tt mfslw txt
161
162 ------------------------------------------------------------------------
163 -- TODO use proxy
164 flowAnnuaire :: (FlowCmdM env err m)
165 => User
166 -> Either CorpusName [CorpusId]
167 -> (TermType Lang)
168 -> FilePath
169 -> m AnnuaireId
170 flowAnnuaire u n l filePath = do
171 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
172 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs
173
174 ------------------------------------------------------------------------
175 flowCorpusFile :: (FlowCmdM env err m)
176 => User
177 -> Either CorpusName [CorpusId]
178 -> Limit -- Limit the number of docs (for dev purpose)
179 -> TermType Lang -> FileFormat -> FilePath
180 -> Maybe FlowSocialListWith
181 -> m CorpusId
182 flowCorpusFile u n l la ff fp mfslw = do
183 eParsed <- liftBase $ parseFile ff fp
184 case eParsed of
185 Right parsed -> do
186 let docs = splitEvery 500 $ take l parsed
187 flowCorpus u n la mfslw (map (map toHyperdataDocument) docs)
188 Left e -> panic $ "Error: " <> (T.pack e)
189
190 ------------------------------------------------------------------------
191 -- | TODO improve the needed type to create/update a corpus
192 -- (For now, Either is enough)
193 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
194 => User
195 -> Either CorpusName [CorpusId]
196 -> TermType Lang
197 -> Maybe FlowSocialListWith
198 -> [[a]]
199 -> m CorpusId
200 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
201
202
203 flow :: ( FlowCmdM env err m
204 , FlowCorpus a
205 , MkCorpus c
206 )
207 => Maybe c
208 -> User
209 -> Either CorpusName [CorpusId]
210 -> TermType Lang
211 -> Maybe FlowSocialListWith
212 -> [[a]]
213 -> m CorpusId
214 flow c u cn la mfslw docs = do
215 -- TODO if public insertMasterDocs else insertUserDocs
216 ids <- traverse (insertMasterDocs c la) docs
217 flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
218
219 ------------------------------------------------------------------------
220 flowCorpusUser :: ( FlowCmdM env err m
221 , MkCorpus c
222 )
223 => Lang
224 -> User
225 -> Either CorpusName [CorpusId]
226 -> Maybe c
227 -> [NodeId]
228 -> Maybe FlowSocialListWith
229 -> m CorpusId
230 flowCorpusUser l user corpusName ctype ids mfslw = do
231 -- User Flow
232 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
233 -- NodeTexts is first
234 _tId <- insertDefaultNode NodeTexts userCorpusId userId
235 -- printDebug "NodeTexts: " tId
236
237 -- NodeList is second
238 listId <- getOrMkList userCorpusId userId
239 -- _cooc <- insertDefaultNode NodeListCooc listId userId
240 -- TODO: check if present already, ignore
241 _ <- Doc.add userCorpusId ids
242
243 -- printDebug "Node Text Ids:" tId
244
245 -- User List Flow
246 (masterUserId, _masterRootId, masterCorpusId)
247 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
248
249 --let gp = (GroupParams l 2 3 (StopSize 3))
250 let gp = GroupWithPosTag l CoreNLP HashMap.empty
251 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
252
253 _userListId <- flowList_DbRepo listId ngs
254 _mastListId <- getOrMkList masterCorpusId masterUserId
255 -- _ <- insertOccsUpdates userCorpusId mastListId
256 -- printDebug "userListId" userListId
257 -- User Graph Flow
258 _ <- insertDefaultNode NodeDashboard userCorpusId userId
259 _ <- insertDefaultNode NodeGraph userCorpusId userId
260 --_ <- mkPhylo userCorpusId userId
261 -- Annuaire Flow
262 -- _ <- mkAnnuaire rootUserId userId
263 pure userCorpusId
264
265
266 insertMasterDocs :: ( FlowCmdM env err m
267 , FlowCorpus a
268 , MkCorpus c
269 )
270 => Maybe c
271 -> TermType Lang
272 -> [a]
273 -> m [DocId]
274 insertMasterDocs c lang hs = do
275 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
276 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
277 _ <- Doc.add masterCorpusId ids'
278 -- TODO
279 -- create a corpus with database name (CSV or PubMed)
280 -- add documents to the corpus (create node_node link)
281 -- this will enable global database monitoring
282
283 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
284 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
285 <- mapNodeIdNgrams
286 <$> documentIdWithNgrams
287 (extractNgramsT $ withLang lang documentsWithId)
288 documentsWithId
289
290 lId <- getOrMkList masterCorpusId masterUserId
291 _ <- saveDocNgramsWith lId mapNgramsDocs'
292
293 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
294 pure ids'
295
296 saveDocNgramsWith :: ( FlowCmdM env err m)
297 => ListId
298 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
299 -> m ()
300 saveDocNgramsWith lId mapNgramsDocs' = do
301 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
302 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
303
304 -- to be removed
305 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
306
307 -- new
308 mapCgramsId <- listInsertDb lId toNodeNgramsW'
309 $ map (first _ngramsTerms . second Map.keys)
310 $ HashMap.toList mapNgramsDocs
311
312 -- insertDocNgrams
313 _return <- insertNodeNodeNgrams2
314 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
315 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
316 <*> Just (fromIntegral w :: Double)
317 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
318 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
319 , (nId, w) <- Map.toList mapNodeIdWeight
320 ]
321 -- to be removed
322 _ <- insertDocNgrams lId indexedNgrams
323
324 pure ()
325
326
327 ------------------------------------------------------------------------
328 -- TODO Type NodeDocumentUnicised
329 insertDocs :: ( FlowCmdM env err m
330 -- , FlowCorpus a
331 , FlowInsertDB a
332 )
333 => UserId
334 -> CorpusId
335 -> [a]
336 -> m ([DocId], [Indexed NodeId a])
337 insertDocs uId cId hs = do
338 let docs = map addUniqId hs
339 newIds <- insertDb uId cId docs
340 -- printDebug "newIds" newIds
341 let
342 newIds' = map reId newIds
343 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
344 _ <- Doc.add cId newIds'
345 pure (newIds', documentsWithId)
346
347
348 ------------------------------------------------------------------------
349 viewUniqId' :: UniqId a
350 => a
351 -> (Hash, a)
352 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
353 where
354 err = panic "[ERROR] Database.Flow.toInsert"
355
356
357 toInserted :: [ReturnId]
358 -> Map Hash ReturnId
359 toInserted =
360 Map.fromList . map (\r -> (reUniqId r, r) )
361 . filter (\r -> reInserted r == True)
362
363 mergeData :: Map Hash ReturnId
364 -> Map Hash a
365 -> [Indexed NodeId a]
366 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
367 where
368 toDocumentWithId (sha,hpd) =
369 Indexed <$> fmap reId (lookup sha rs)
370 <*> Just hpd
371
372 ------------------------------------------------------------------------
373 ------------------------------------------------------------------------
374 ------------------------------------------------------------------------
375 documentIdWithNgrams :: HasNodeError err
376 => (a
377 -> Cmd err (HashMap b (Map NgramsType Int)))
378 -> [Indexed NodeId a]
379 -> Cmd err [DocumentIdWithNgrams a b]
380 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
381 where
382 toDocumentIdWithNgrams d = do
383 e <- f $ _unIndex d
384 pure $ DocumentIdWithNgrams d e
385
386
387 -- | TODO check optimization
388 mapNodeIdNgrams :: (Ord b, Hashable b)
389 => [DocumentIdWithNgrams a b]
390 -> HashMap b
391 (Map NgramsType
392 (Map NodeId Int)
393 )
394 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
395 where
396 f :: DocumentIdWithNgrams a b
397 -> HashMap b (Map NgramsType (Map NodeId Int))
398 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
399 where
400 nId = _index $ documentWithId d
401
402
403 ------------------------------------------------------------------------
404 instance ExtractNgramsT HyperdataContact
405 where
406 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
407 where
408 extract :: TermType Lang -> HyperdataContact
409 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
410 extract _l hc' = do
411 let authors = map text2ngrams
412 $ maybe ["Nothing"] (\a -> [a])
413 $ view (hc_who . _Just . cw_lastName) hc'
414
415 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
416
417
418 instance ExtractNgramsT HyperdataDocument
419 where
420 extractNgramsT :: TermType Lang
421 -> HyperdataDocument
422 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
423 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
424 where
425 extractNgramsT' :: TermType Lang
426 -> HyperdataDocument
427 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
428 extractNgramsT' lang' doc = do
429 let source = text2ngrams
430 $ maybe "Nothing" identity
431 $ _hd_source doc
432
433 institutes = map text2ngrams
434 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
435 $ _hd_institutes doc
436
437 authors = map text2ngrams
438 $ maybe ["Nothing"] (T.splitOn ", ")
439 $ _hd_authors doc
440
441 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
442 <$> concat
443 <$> liftBase (extractTerms lang' $ hasText doc)
444
445 pure $ HashMap.fromList
446 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
447 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
448 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
449 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
450
451 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
452 where
453 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
454
455 instance HasText a => HasText (Node a)
456 where
457 hasText (Node _ _ _ _ _ _ _ h) = hasText h
458
459
460
461 -- | TODO putelsewhere
462 -- | Upgrade function
463 -- Suppose all documents are English (this is the case actually)
464 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
465 indexAllDocumentsWithPosTag = do
466 rootId <- getRootId (UserName userMaster)
467 corpusIds <- findNodesId rootId [NodeCorpus]
468 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
469
470 _ <- mapM extractInsert (splitEvery 1000 docs)
471
472 pure ()
473
474 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
475 extractInsert docs = do
476 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
477
478 mapNgramsDocs' <- mapNodeIdNgrams
479 <$> documentIdWithNgrams
480 (extractNgramsT $ withLang (Multi EN) documentsWithId)
481 documentsWithId
482
483 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
484
485 pure ()
486
487