]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[TYPES] group type for lems
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36
37 , getOrMkRoot
38 , getOrMk_RootWithCorpus
39 , TermType(..)
40 , DataOrigin(..)
41 , allDataOrigins
42
43 , do_api
44 )
45 where
46
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
49 import Data.Either
50 import Data.HashMap.Strict (HashMap)
51 import Data.Hashable (Hashable)
52 import Data.List (concat)
53 import Data.Map (Map, lookup)
54 import Data.Maybe (catMaybes)
55 import Data.Monoid
56 import Data.Swagger
57 import Data.Text (splitOn)
58 import Data.Traversable (traverse)
59 import Data.Tuple.Extra (first, second)
60 import GHC.Generics (Generic)
61 import System.FilePath (FilePath)
62 import qualified Data.HashMap.Strict as HashMap
63 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
64 import qualified Data.Map as Map
65
66 import Gargantext.Core (Lang(..), PosTagAlgo(..))
67 import Gargantext.Core.Ext.IMT (toSchoolName)
68 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
69 import Gargantext.Core.Flow.Types
70 import Gargantext.Core.Text
71 import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..))
72 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
73 import Gargantext.Core.Text.List (buildNgramsLists)
74 import Gargantext.Core.Text.Terms
75 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
76 import Gargantext.Core.Types (POS(NP))
77 import Gargantext.Core.Types.Individu (User(..))
78 import Gargantext.Core.Types.Main
79 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
80 import Gargantext.Database.Action.Flow.List
81 import Gargantext.Database.Action.Flow.Types
82 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
83 import Gargantext.Database.Action.Search (searchDocInDatabase)
84 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
85 import Gargantext.Database.Admin.Types.Hyperdata
86 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
87 import Gargantext.Database.Prelude
88 import Gargantext.Database.Query.Table.Ngrams
89 import Gargantext.Database.Query.Table.Node
90 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
91 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
92 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
93 import Gargantext.Database.Query.Table.NodeNodeNgrams2
94 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
95 import Gargantext.Database.Schema.Node (NodePoly(..))
96 import Gargantext.Database.Types
97 import Gargantext.Prelude
98 import Gargantext.Prelude.Crypto.Hash (Hash)
99 import qualified Gargantext.Core.Text.Corpus.API as API
100 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
101
102 ------------------------------------------------------------------------
103 -- TODO use internal with API name (could be old data)
-- | Provenance of data entering a flow.
-- 'InternalOrigin' tags data that is already in the local database
-- (the API field records where it originally came from — see the TODO
-- above: "could be old data"); 'ExternalOrigin' tags data that must be
-- fetched from the external API now.
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
-- JSON field names drop the "_do_" prefix (e.g. "api").
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
113
-- | Every supported data origin: each external API wrapped first as an
-- internal origin, then as an external one.
allDataOrigins :: [DataOrigin]
allDataOrigins = internals <> externals
  where
    internals = map InternalOrigin API.externalAPIs
    externals = map ExternalOrigin API.externalAPIs
117
118 ---------------
-- | Texts to be flowed: either references to documents already stored
-- ('DataOld', by node id) or batches of new documents to insert
-- ('DataNew').  Fields are strict.
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
121
-- TODO use the split parameter in config file
-- | Fetch texts for a query according to the 'DataOrigin'.
-- External origins call the remote API and batch the results in
-- groups of 500; the internal origin searches the master user's
-- corpus in the database for the stemmed query and returns the
-- matching node ids.
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
                                         <$> splitEvery 500
                                         <$> API.get api (_tt_lang la) q li

getDataText (InternalOrigin _) _la q _li = do
  -- Look up the master corpus (empty corpus name) to search in.
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids
140
141 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into the given corpus.
-- Old data (node ids already in the database) goes straight to
-- 'flowCorpusUser'; new document batches are inserted via 'flowCorpus'.
flowDataText :: ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
  where
    -- No custom corpus hyperdata is attached in this path.
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
153
154 ------------------------------------------------------------------------
155 -- TODO use proxy
-- | Flow an annuaire (contact directory) into the database from a
-- serialised IMT users file, in batches of 500 contacts.
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> m AnnuaireId
flowAnnuaire user name lang fp = do
  contactBatches <- liftBase
    (splitEvery 500 <$> deserialiseImtUsersFromFile fp :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) user name lang contactBatches
165
166 ------------------------------------------------------------------------
-- | Flow a corpus from a file on disk: parse it, keep at most the
-- requested number of documents, and insert them in batches of 500.
flowCorpusFile :: (FlowCmdM env err m)
           => User
           -> Either CorpusName [CorpusId]
           -> Limit -- Limit the number of docs (for dev purpose)
           -> TermType Lang -> FileFormat -> FilePath
           -> m CorpusId
flowCorpusFile user name limit lang format path = do
  parsed <- liftBase (parseFile format path)
  let batches = splitEvery 500 (take limit parsed)
  flowCorpus user name lang (map (map toHyperdataDocument) batches)
179
180 ------------------------------------------------------------------------
181 -- | TODO improve the needed type to create/update a corpus
182 -- (For now, Either is enough)
-- | Flow document batches into a corpus: specialisation of 'flow'
-- with no custom corpus hyperdata.
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
190
191
-- | Generic corpus flow: insert each batch of documents into the
-- master corpus ('insertMasterDocs'), then attach all resulting node
-- ids to the user's corpus via 'flowCorpusUser'.
flow :: ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs = do
  -- TODO if public insertMasterDocs else insertUserDocs
  ids <- traverse (insertMasterDocs c la) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
206
207 ------------------------------------------------------------------------
-- | Build the user-side view of a flowed corpus: get-or-create the
-- user's root and corpus nodes, attach the standard child nodes
-- (texts, list, dashboard, graph), link the inserted document nodes,
-- and build and store the ngrams lists.  Returns the user corpus id.
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  -- NOTE(review): list building reads both the user corpus and the
  -- master corpus (empty corpus name under the master user).
  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph userCorpusId userId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  pure userCorpusId
246
247
-- | Insert documents under the master user's corpus: create the
-- document nodes, extract ngrams per document, insert the ngrams and
-- their per-document weights, and return the new document node ids.
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- Extract ngrams for every inserted document and merge them into a
  -- single nested map: ngram -> ngrams type -> node id -> count.
  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
    <- mapNodeIdNgrams
    <$> documentIdWithNgrams
        (extractNgramsT $ withLang lang documentsWithId)
        documentsWithId

  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- to be removed
  let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  -- new
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ HashMap.toList mapNgramsDocs
  -- insertDocNgrams
  -- Flatten the nested map into one row per (ngram, type, node).
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                         <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w) <- Map.toList mapNodeIdWeight
                       ]

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams
  pure ids'
297
298 ------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
-- | Insert documents under a corpus, deduplicated by their uniqueness
-- hash ('addUniqId').  Returns the new node ids plus the documents
-- indexed by those ids (only documents actually inserted — see
-- 'toInserted' / 'mergeData').
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([DocId], [Indexed NodeId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    -- Join the returned ids with the original docs via the uniq hash.
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)
317
318
319 ------------------------------------------------------------------------
-- | Pair a value with its uniqueness hash; panics if the hash has not
-- been set (callers must have applied 'addUniqId' first).
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d =
  case view uniqId d of
    Just h  -> (h, d)
    Nothing -> panic "[ERROR] Database.Flow.toInsert"
326
327
-- | Index the freshly inserted rows by their uniqueness hash,
-- dropping rows that were already present in the database
-- (i.e. where 'reInserted' is False).
toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r) )
               -- `filter reInserted` replaces the redundant
               -- `\r -> reInserted r == True`.
               . filter reInserted
333
-- | Join inserted return-ids with the original documents by hash:
-- each document whose hash appears in the first map is paired with
-- the corresponding node id; the rest are dropped.
mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData returned = catMaybes . map pairUp . Map.toList
  where
    pairUp (sha, doc) =
      case lookup sha returned of
        Nothing  -> Nothing
        Just rid -> Just (Indexed (reId rid) doc)
342
343 ------------------------------------------------------------------------
344 ------------------------------------------------------------------------
345 ------------------------------------------------------------------------
-- | Run the given ngrams-extraction action over every indexed
-- document, pairing each document with its extracted ngrams.
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (HashMap b (Map NgramsType Int)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams extract = traverse attach
  where
    attach doc = DocumentIdWithNgrams doc <$> extract (_unIndex doc)
356
357
-- TODO check optimization
-- | Merge per-document ngrams counts into one nested map keyed by
-- ngram, then ngrams type, then document node id.  Counts for the
-- same (ngram, type, node) triple are summed.
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                       (Map NgramsType
                            (Map NodeId Int)
                       )
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . map perDocument
  where
    perDocument :: DocumentIdWithNgrams a b
                -> HashMap b (Map NgramsType (Map NodeId Int))
    perDocument d =
      let nodeId = _index $ documentWithId d
      in  fmap (fmap (Map.singleton nodeId)) (documentNgrams d)
372
373
374 ------------------------------------------------------------------------
-- | Contacts contribute only 'Authors' ngrams: the last name, when
-- present, becomes a single ngram (a missing last name yields the
-- literal string "Nothing").  Keys are passed through
-- 'cleanExtractedNgrams 255' — presumably a length cap; confirm.
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
387
388
-- | Documents contribute Sources, Institutes, Authors and extracted
-- NgramsTerms.  Missing metadata fields fall back to the literal
-- string "Nothing"; institutes and authors are split on ", ".
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hd_authors doc

          -- Terms extracted from the document text, enriched with
          -- CoreNLP noun-phrase (NP) POS tags.
          terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
                 <$> concat
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $ [(SimpleNgrams source, Map.singleton Sources 1) ]
              <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
              <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
              <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
421
-- | Extraction on a node delegates to its hyperdata payload.
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
425
-- | Text extraction on a node delegates to its hyperdata payload.
instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h
429
430