]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
Merge branch 'dev-kawen' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantex...
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36
37 , getOrMkRoot
38 , getOrMk_RootWithCorpus
39 , TermType(..)
40 , DataOrigin(..)
41 , allDataOrigins
42
43 , do_api
44 , indexAllDocumentsWithPosTag
45 )
46 where
47
48 import Control.Lens ((^.), view, _Just, makeLenses)
49 import Data.Aeson.TH (deriveJSON)
50 import Data.Either
51 import Data.HashMap.Strict (HashMap)
52 import Data.Hashable (Hashable)
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (catMaybes)
56 import Data.Monoid
57 import Data.Swagger
58 import Data.Text (splitOn)
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import GHC.Generics (Generic)
62 import System.FilePath (FilePath)
63 import qualified Data.HashMap.Strict as HashMap
64 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
65 import qualified Data.Map as Map
66
67 import Gargantext.Core (Lang(..), PosTagAlgo(..))
68 import Gargantext.Core.Ext.IMT (toSchoolName)
69 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
70 import Gargantext.Core.Flow.Types
71 import Gargantext.Core.Text
72 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
73 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
74 import Gargantext.Core.Text.List (buildNgramsLists)
75 import Gargantext.Core.Text.Terms
76 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
77 import Gargantext.Core.Types (POS(NP))
78 import Gargantext.Core.Types.Individu (User(..))
79 import Gargantext.Core.Types.Main
80 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
81 import Gargantext.Database.Action.Flow.List
82 import Gargantext.Database.Action.Flow.Types
83 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
84 import Gargantext.Database.Action.Search (searchDocInDatabase)
85 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
86 import Gargantext.Database.Admin.Types.Hyperdata
87 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
88 import Gargantext.Database.Prelude
89 import Gargantext.Database.Query.Table.Ngrams
90 import Gargantext.Database.Query.Table.Node
91 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
92 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
93 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
94 import Gargantext.Database.Query.Table.NodeNodeNgrams2
95 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
96 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
97 import Gargantext.Database.Types
98 import Gargantext.Prelude
99 import Gargantext.Prelude.Crypto.Hash (Hash)
100 import qualified Gargantext.Core.Text.Corpus.API as API
101 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
102
103 ------------------------------------------------------------------------
104 -- Impots for upgrade function
105 import Gargantext.Database.Query.Tree.Root (getRootId)
106 import Gargantext.Database.Query.Tree (findNodesId)
107 import qualified Data.List as List
108 ------------------------------------------------------------------------
109 -- TODO use internal with API name (could be old data)
110 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
111 | ExternalOrigin { _do_api :: API.ExternalAPIs }
112 -- TODO Web
113 deriving (Generic, Eq)
114
115 makeLenses ''DataOrigin
116 deriveJSON (unPrefix "_do_") ''DataOrigin
117 instance ToSchema DataOrigin where
118 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
119
120 allDataOrigins :: [DataOrigin]
121 allDataOrigins = map InternalOrigin API.externalAPIs
122 <> map ExternalOrigin API.externalAPIs
123
124 ---------------
125 data DataText = DataOld ![NodeId]
126 | DataNew ![[HyperdataDocument]]
127
128 -- TODO use the split parameter in config file
129 getDataText :: FlowCmdM env err m
130 => DataOrigin
131 -> TermType Lang
132 -> API.Query
133 -> Maybe API.Limit
134 -> m DataText
135 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
136 <$> splitEvery 500
137 <$> API.get api (_tt_lang la) q li
138
139 getDataText (InternalOrigin _) _la q _li = do
140 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
141 (UserName userMaster)
142 (Left "")
143 (Nothing :: Maybe HyperdataCorpus)
144 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
145 pure $ DataOld ids
146
147 -------------------------------------------------------------------------------
148 flowDataText :: ( FlowCmdM env err m
149 )
150 => User
151 -> DataText
152 -> TermType Lang
153 -> CorpusId
154 -> m CorpusId
155 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
156 where
157 corpusType = (Nothing :: Maybe HyperdataCorpus)
158 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
159
160 ------------------------------------------------------------------------
161 -- TODO use proxy
162 flowAnnuaire :: (FlowCmdM env err m)
163 => User
164 -> Either CorpusName [CorpusId]
165 -> (TermType Lang)
166 -> FilePath
167 -> m AnnuaireId
168 flowAnnuaire u n l filePath = do
169 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
170 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
171
172 ------------------------------------------------------------------------
173 flowCorpusFile :: (FlowCmdM env err m)
174 => User
175 -> Either CorpusName [CorpusId]
176 -> Limit -- Limit the number of docs (for dev purpose)
177 -> TermType Lang -> FileFormat -> FilePath
178 -> m CorpusId
179 flowCorpusFile u n l la ff fp = do
180 docs <- liftBase ( splitEvery 500
181 <$> take l
182 <$> parseFile ff fp
183 )
184 flowCorpus u n la (map (map toHyperdataDocument) docs)
185
186 ------------------------------------------------------------------------
187 -- | TODO improve the needed type to create/update a corpus
188 -- (For now, Either is enough)
189 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
190 => User
191 -> Either CorpusName [CorpusId]
192 -> TermType Lang
193 -> [[a]]
194 -> m CorpusId
195 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
196
197
198 flow :: ( FlowCmdM env err m
199 , FlowCorpus a
200 , MkCorpus c
201 )
202 => Maybe c
203 -> User
204 -> Either CorpusName [CorpusId]
205 -> TermType Lang
206 -> [[a]]
207 -> m CorpusId
208 flow c u cn la docs = do
209 -- TODO if public insertMasterDocs else insertUserDocs
210 ids <- traverse (insertMasterDocs c la) docs
211 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
212
213 ------------------------------------------------------------------------
214 flowCorpusUser :: ( FlowCmdM env err m
215 , MkCorpus c
216 )
217 => Lang
218 -> User
219 -> Either CorpusName [CorpusId]
220 -> Maybe c
221 -> [NodeId]
222 -> m CorpusId
223 flowCorpusUser l user corpusName ctype ids = do
224 -- User Flow
225 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
226 -- NodeTexts is first
227 _tId <- insertDefaultNode NodeTexts userCorpusId userId
228 -- printDebug "NodeTexts: " tId
229
230 -- NodeList is second
231 listId <- getOrMkList userCorpusId userId
232 -- _cooc <- insertDefaultNode NodeListCooc listId userId
233 -- TODO: check if present already, ignore
234 _ <- Doc.add userCorpusId ids
235
236 -- printDebug "Node Text Ids:" tId
237
238 -- User List Flow
239 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
240
241 --let gp = (GroupParams l 2 3 (StopSize 3))
242 let gp = GroupWithPosTag l CoreNLP HashMap.empty
243 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
244
245 _userListId <- flowList_DbRepo listId ngs
246 _mastListId <- getOrMkList masterCorpusId masterUserId
247 -- _ <- insertOccsUpdates userCorpusId mastListId
248 -- printDebug "userListId" userListId
249 -- User Graph Flow
250 _ <- insertDefaultNode NodeDashboard userCorpusId userId
251 _ <- insertDefaultNode NodeGraph userCorpusId userId
252 --_ <- mkPhylo userCorpusId userId
253 -- Annuaire Flow
254 -- _ <- mkAnnuaire rootUserId userId
255 pure userCorpusId
256
257
258 insertMasterDocs :: ( FlowCmdM env err m
259 , FlowCorpus a
260 , MkCorpus c
261 )
262 => Maybe c
263 -> TermType Lang
264 -> [a]
265 -> m [DocId]
266 insertMasterDocs c lang hs = do
267 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
268 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
269 _ <- Doc.add masterCorpusId ids'
270 -- TODO
271 -- create a corpus with database name (CSV or PubMed)
272 -- add documents to the corpus (create node_node link)
273 -- this will enable global database monitoring
274
275 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
276 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
277 <- mapNodeIdNgrams
278 <$> documentIdWithNgrams
279 (extractNgramsT $ withLang lang documentsWithId)
280 documentsWithId
281
282 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
283 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
284
285 -- to be removed
286 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
287
288 -- new
289 lId <- getOrMkList masterCorpusId masterUserId
290 mapCgramsId <- listInsertDb lId toNodeNgramsW'
291 $ map (first _ngramsTerms . second Map.keys)
292 $ HashMap.toList mapNgramsDocs
293 -- insertDocNgrams
294 _return <- insertNodeNodeNgrams2
295 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
296 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
297 <*> Just (fromIntegral w :: Double)
298 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
299 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
300 , (nId, w) <- Map.toList mapNodeIdWeight
301 ]
302
303 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
304 -- to be removed
305 _ <- insertDocNgrams lId indexedNgrams
306 pure ids'
307
308 ------------------------------------------------------------------------
309 -- TODO Type NodeDocumentUnicised
310 insertDocs :: ( FlowCmdM env err m
311 -- , FlowCorpus a
312 , FlowInsertDB a
313 )
314 => UserId
315 -> CorpusId
316 -> [a]
317 -> m ([DocId], [Indexed NodeId a])
318 insertDocs uId cId hs = do
319 let docs = map addUniqId hs
320 newIds <- insertDb uId cId docs
321 -- printDebug "newIds" newIds
322 let
323 newIds' = map reId newIds
324 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
325 _ <- Doc.add cId newIds'
326 pure (newIds', documentsWithId)
327
328
329 ------------------------------------------------------------------------
330 viewUniqId' :: UniqId a
331 => a
332 -> (Hash, a)
333 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
334 where
335 err = panic "[ERROR] Database.Flow.toInsert"
336
337
338 toInserted :: [ReturnId]
339 -> Map Hash ReturnId
340 toInserted =
341 Map.fromList . map (\r -> (reUniqId r, r) )
342 . filter (\r -> reInserted r == True)
343
344 mergeData :: Map Hash ReturnId
345 -> Map Hash a
346 -> [Indexed NodeId a]
347 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
348 where
349 toDocumentWithId (sha,hpd) =
350 Indexed <$> fmap reId (lookup sha rs)
351 <*> Just hpd
352
353 ------------------------------------------------------------------------
354 ------------------------------------------------------------------------
355 ------------------------------------------------------------------------
356 documentIdWithNgrams :: HasNodeError err
357 => (a
358 -> Cmd err (HashMap b (Map NgramsType Int)))
359 -> [Indexed NodeId a]
360 -> Cmd err [DocumentIdWithNgrams a b]
361 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
362 where
363 toDocumentIdWithNgrams d = do
364 e <- f $ _unIndex d
365 pure $ DocumentIdWithNgrams d e
366
367
368 -- | TODO check optimization
369 mapNodeIdNgrams :: (Ord b, Hashable b)
370 => [DocumentIdWithNgrams a b]
371 -> HashMap b
372 (Map NgramsType
373 (Map NodeId Int)
374 )
375 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
376 where
377 f :: DocumentIdWithNgrams a b
378 -> HashMap b (Map NgramsType (Map NodeId Int))
379 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
380 where
381 nId = _index $ documentWithId d
382
383
384 ------------------------------------------------------------------------
385 instance ExtractNgramsT HyperdataContact
386 where
387 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
388 where
389 extract :: TermType Lang -> HyperdataContact
390 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
391 extract _l hc' = do
392 let authors = map text2ngrams
393 $ maybe ["Nothing"] (\a -> [a])
394 $ view (hc_who . _Just . cw_lastName) hc'
395
396 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
397
398
399 instance ExtractNgramsT HyperdataDocument
400 where
401 extractNgramsT :: TermType Lang
402 -> HyperdataDocument
403 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
404 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
405 where
406 extractNgramsT' :: TermType Lang
407 -> HyperdataDocument
408 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
409 extractNgramsT' lang' doc = do
410 let source = text2ngrams
411 $ maybe "Nothing" identity
412 $ _hd_source doc
413
414 institutes = map text2ngrams
415 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
416 $ _hd_institutes doc
417
418 authors = map text2ngrams
419 $ maybe ["Nothing"] (splitOn ", ")
420 $ _hd_authors doc
421
422 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
423 <$> concat
424 <$> liftBase (extractTerms lang' $ hasText doc)
425
426 pure $ HashMap.fromList
427 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
428 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
429 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
430 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
431
432 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
433 where
434 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
435
436 instance HasText a => HasText (Node a)
437 where
438 hasText (Node _ _ _ _ _ _ _ h) = hasText h
439
440
441
442 -- | TODO putelsewhere
443 -- | Upgrade function
444 -- Suppose all documents are English (this is the case actually)
445 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
446 indexAllDocumentsWithPosTag = do
447 rootId <- getRootId (UserName userMaster)
448 corpusIds <- findNodesId rootId [NodeCorpus]
449 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
450
451 _ <- mapM extractInsert (splitEvery 1000 docs)
452
453 pure ()
454
455 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
456 extractInsert docs = do
457 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
458
459 mapNgramsDocs' <- mapNodeIdNgrams
460 <$> documentIdWithNgrams
461 (extractNgramsT $ withLang (Multi EN) documentsWithId)
462 documentsWithId
463
464 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
465
466 pure ()
467
468
469
470
471
472
473
474