]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[FEAT] Indexation function to test
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36
37 , getOrMkRoot
38 , getOrMk_RootWithCorpus
39 , TermType(..)
40 , DataOrigin(..)
41 , allDataOrigins
42
43 , do_api
44 , indexAllDocumentsWithPosTag
45 )
46 where
47
48 import Control.Lens ((^.), view, _Just, makeLenses)
49 import Data.Aeson.TH (deriveJSON)
50 import Data.Either
51 import Data.HashMap.Strict (HashMap)
52 import Data.Hashable (Hashable)
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (catMaybes)
56 import Data.Monoid
57 import Data.Swagger
58 import Data.Text (splitOn)
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import GHC.Generics (Generic)
62 import System.FilePath (FilePath)
63 import qualified Data.HashMap.Strict as HashMap
64 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
65 import qualified Data.Map as Map
66
67 import Gargantext.Core (Lang(..), PosTagAlgo(..))
68 import Gargantext.Core.Ext.IMT (toSchoolName)
69 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
70 import Gargantext.Core.Flow.Types
71 import Gargantext.Core.Text
72 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
73 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
74 import Gargantext.Core.Text.List (buildNgramsLists)
75 import Gargantext.Core.Text.Terms
76 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
77 import Gargantext.Core.Types (POS(NP))
78 import Gargantext.Core.Types.Individu (User(..))
79 import Gargantext.Core.Types.Main
80 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
81 import Gargantext.Database.Action.Flow.List
82 import Gargantext.Database.Action.Flow.Types
83 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
84 import Gargantext.Database.Action.Search (searchDocInDatabase)
85 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
86 import Gargantext.Database.Admin.Types.Hyperdata
87 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
88 import Gargantext.Database.Prelude
89 import Gargantext.Database.Query.Table.Ngrams
90 import Gargantext.Database.Query.Table.Node
91 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
92 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
93 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
94 import Gargantext.Database.Query.Table.NodeNodeNgrams2
95 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
96 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
97 import Gargantext.Database.Types
98 import Gargantext.Prelude
99 import Gargantext.Prelude.Crypto.Hash (Hash)
100 import qualified Gargantext.Core.Text.Corpus.API as API
101 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
102
103 ------------------------------------------------------------------------
104 -- Impots for upgrade function
105 import Gargantext.Database.Query.Tree.Root (getRootId)
106 import Gargantext.Database.Query.Tree (findNodesId)
107 import qualified Data.List as List
108 ------------------------------------------------------------------------
109 -- TODO use internal with API name (could be old data)
110 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
111 | ExternalOrigin { _do_api :: API.ExternalAPIs }
112 -- TODO Web
113 deriving (Generic, Eq)
114
115 makeLenses ''DataOrigin
116 deriveJSON (unPrefix "_do_") ''DataOrigin
117 instance ToSchema DataOrigin where
118 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
119
120 allDataOrigins :: [DataOrigin]
121 allDataOrigins = map InternalOrigin API.externalAPIs
122 <> map ExternalOrigin API.externalAPIs
123
124 ---------------
125 data DataText = DataOld ![NodeId]
126 | DataNew ![[HyperdataDocument]]
127
128 -- TODO use the split parameter in config file
129 getDataText :: FlowCmdM env err m
130 => DataOrigin
131 -> TermType Lang
132 -> API.Query
133 -> Maybe API.Limit
134 -> m DataText
135 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
136 <$> splitEvery 500
137 <$> API.get api (_tt_lang la) q li
138
139 getDataText (InternalOrigin _) _la q _li = do
140 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
141 (UserName userMaster)
142 (Left "")
143 (Nothing :: Maybe HyperdataCorpus)
144 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
145 pure $ DataOld ids
146
147 -------------------------------------------------------------------------------
148 flowDataText :: ( FlowCmdM env err m
149 )
150 => User
151 -> DataText
152 -> TermType Lang
153 -> CorpusId
154 -> m CorpusId
155 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
156 where
157 corpusType = (Nothing :: Maybe HyperdataCorpus)
158 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
159
160 ------------------------------------------------------------------------
161 -- TODO use proxy
162 flowAnnuaire :: (FlowCmdM env err m)
163 => User
164 -> Either CorpusName [CorpusId]
165 -> (TermType Lang)
166 -> FilePath
167 -> m AnnuaireId
168 flowAnnuaire u n l filePath = do
169 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
170 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
171
172 ------------------------------------------------------------------------
173 flowCorpusFile :: (FlowCmdM env err m)
174 => User
175 -> Either CorpusName [CorpusId]
176 -> Limit -- Limit the number of docs (for dev purpose)
177 -> TermType Lang -> FileFormat -> FilePath
178 -> m CorpusId
179 flowCorpusFile u n l la ff fp = do
180 docs <- liftBase ( splitEvery 500
181 <$> take l
182 <$> parseFile ff fp
183 )
184 flowCorpus u n la (map (map toHyperdataDocument) docs)
185
186 ------------------------------------------------------------------------
187 -- | TODO improve the needed type to create/update a corpus
188 -- (For now, Either is enough)
189 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
190 => User
191 -> Either CorpusName [CorpusId]
192 -> TermType Lang
193 -> [[a]]
194 -> m CorpusId
195 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
196
197
198 flow :: ( FlowCmdM env err m
199 , FlowCorpus a
200 , MkCorpus c
201 )
202 => Maybe c
203 -> User
204 -> Either CorpusName [CorpusId]
205 -> TermType Lang
206 -> [[a]]
207 -> m CorpusId
208 flow c u cn la docs = do
209 -- TODO if public insertMasterDocs else insertUserDocs
210 ids <- traverse (insertMasterDocs c la) docs
211 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
212
213 ------------------------------------------------------------------------
214 flowCorpusUser :: ( FlowCmdM env err m
215 , MkCorpus c
216 )
217 => Lang
218 -> User
219 -> Either CorpusName [CorpusId]
220 -> Maybe c
221 -> [NodeId]
222 -> m CorpusId
223 flowCorpusUser l user corpusName ctype ids = do
224 -- User Flow
225 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
226 -- NodeTexts is first
227 _tId <- insertDefaultNode NodeTexts userCorpusId userId
228 -- printDebug "NodeTexts: " tId
229
230 -- NodeList is second
231 listId <- getOrMkList userCorpusId userId
232 -- _cooc <- insertDefaultNode NodeListCooc listId userId
233 -- TODO: check if present already, ignore
234 _ <- Doc.add userCorpusId ids
235
236 -- printDebug "Node Text Ids:" tId
237
238 -- User List Flow
239 (masterUserId, _masterRootId, masterCorpusId)
240 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
241
242 --let gp = (GroupParams l 2 3 (StopSize 3))
243 let gp = GroupWithPosTag l CoreNLP HashMap.empty
244 ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
245
246 _userListId <- flowList_DbRepo listId ngs
247 _mastListId <- getOrMkList masterCorpusId masterUserId
248 -- _ <- insertOccsUpdates userCorpusId mastListId
249 -- printDebug "userListId" userListId
250 -- User Graph Flow
251 _ <- insertDefaultNode NodeDashboard userCorpusId userId
252 _ <- insertDefaultNode NodeGraph userCorpusId userId
253 --_ <- mkPhylo userCorpusId userId
254 -- Annuaire Flow
255 -- _ <- mkAnnuaire rootUserId userId
256 pure userCorpusId
257
258
259 insertMasterDocs :: ( FlowCmdM env err m
260 , FlowCorpus a
261 , MkCorpus c
262 )
263 => Maybe c
264 -> TermType Lang
265 -> [a]
266 -> m [DocId]
267 insertMasterDocs c lang hs = do
268 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
269 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
270 _ <- Doc.add masterCorpusId ids'
271 -- TODO
272 -- create a corpus with database name (CSV or PubMed)
273 -- add documents to the corpus (create node_node link)
274 -- this will enable global database monitoring
275
276 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
277 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
278 <- mapNodeIdNgrams
279 <$> documentIdWithNgrams
280 (extractNgramsT $ withLang lang documentsWithId)
281 documentsWithId
282
283 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
284 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
285
286 -- to be removed
287 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
288
289 -- new
290 lId <- getOrMkList masterCorpusId masterUserId
291 mapCgramsId <- listInsertDb lId toNodeNgramsW'
292 $ map (first _ngramsTerms . second Map.keys)
293 $ HashMap.toList mapNgramsDocs
294 -- insertDocNgrams
295 _return <- insertNodeNodeNgrams2
296 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
297 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
298 <*> Just (fromIntegral w :: Double)
299 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
300 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
301 , (nId, w) <- Map.toList mapNodeIdWeight
302 ]
303
304 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
305 -- to be removed
306 _ <- insertDocNgrams lId indexedNgrams
307 pure ids'
308
309 ------------------------------------------------------------------------
310 -- TODO Type NodeDocumentUnicised
311 insertDocs :: ( FlowCmdM env err m
312 -- , FlowCorpus a
313 , FlowInsertDB a
314 )
315 => UserId
316 -> CorpusId
317 -> [a]
318 -> m ([DocId], [Indexed NodeId a])
319 insertDocs uId cId hs = do
320 let docs = map addUniqId hs
321 newIds <- insertDb uId cId docs
322 -- printDebug "newIds" newIds
323 let
324 newIds' = map reId newIds
325 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
326 _ <- Doc.add cId newIds'
327 pure (newIds', documentsWithId)
328
329
330 ------------------------------------------------------------------------
331 viewUniqId' :: UniqId a
332 => a
333 -> (Hash, a)
334 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
335 where
336 err = panic "[ERROR] Database.Flow.toInsert"
337
338
339 toInserted :: [ReturnId]
340 -> Map Hash ReturnId
341 toInserted =
342 Map.fromList . map (\r -> (reUniqId r, r) )
343 . filter (\r -> reInserted r == True)
344
345 mergeData :: Map Hash ReturnId
346 -> Map Hash a
347 -> [Indexed NodeId a]
348 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
349 where
350 toDocumentWithId (sha,hpd) =
351 Indexed <$> fmap reId (lookup sha rs)
352 <*> Just hpd
353
354 ------------------------------------------------------------------------
355 ------------------------------------------------------------------------
356 ------------------------------------------------------------------------
357 documentIdWithNgrams :: HasNodeError err
358 => (a
359 -> Cmd err (HashMap b (Map NgramsType Int)))
360 -> [Indexed NodeId a]
361 -> Cmd err [DocumentIdWithNgrams a b]
362 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
363 where
364 toDocumentIdWithNgrams d = do
365 e <- f $ _unIndex d
366 pure $ DocumentIdWithNgrams d e
367
368
369 -- | TODO check optimization
370 mapNodeIdNgrams :: (Ord b, Hashable b)
371 => [DocumentIdWithNgrams a b]
372 -> HashMap b
373 (Map NgramsType
374 (Map NodeId Int)
375 )
376 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
377 where
378 f :: DocumentIdWithNgrams a b
379 -> HashMap b (Map NgramsType (Map NodeId Int))
380 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
381 where
382 nId = _index $ documentWithId d
383
384
385 ------------------------------------------------------------------------
386 instance ExtractNgramsT HyperdataContact
387 where
388 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
389 where
390 extract :: TermType Lang -> HyperdataContact
391 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
392 extract _l hc' = do
393 let authors = map text2ngrams
394 $ maybe ["Nothing"] (\a -> [a])
395 $ view (hc_who . _Just . cw_lastName) hc'
396
397 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
398
399
400 instance ExtractNgramsT HyperdataDocument
401 where
402 extractNgramsT :: TermType Lang
403 -> HyperdataDocument
404 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
405 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
406 where
407 extractNgramsT' :: TermType Lang
408 -> HyperdataDocument
409 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
410 extractNgramsT' lang' doc = do
411 let source = text2ngrams
412 $ maybe "Nothing" identity
413 $ _hd_source doc
414
415 institutes = map text2ngrams
416 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
417 $ _hd_institutes doc
418
419 authors = map text2ngrams
420 $ maybe ["Nothing"] (splitOn ", ")
421 $ _hd_authors doc
422
423 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
424 <$> concat
425 <$> liftBase (extractTerms lang' $ hasText doc)
426
427 pure $ HashMap.fromList
428 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
429 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
430 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
431 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
432
433 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
434 where
435 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
436
437 instance HasText a => HasText (Node a)
438 where
439 hasText (Node _ _ _ _ _ _ _ h) = hasText h
440
441
442
443 -- | TODO putelsewhere
444 -- | Upgrade function
445 -- Suppose all documents are English (this is the case actually)
446 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
447 indexAllDocumentsWithPosTag = do
448 rootId <- getRootId (UserName userMaster)
449 corpusIds <- findNodesId rootId [NodeCorpus]
450 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
451
452 _ <- mapM extractInsert (splitEvery 1000 docs)
453
454 pure ()
455
456 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
457 extractInsert docs = do
458 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
459
460 mapNgramsDocs' <- mapNodeIdNgrams
461 <$> documentIdWithNgrams
462 (extractNgramsT $ withLang (Multi EN) documentsWithId)
463 documentsWithId
464
465 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
466
467 pure ()
468
469
470
471
472
473
474
475