]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[FEAT] group ngrams, connected (testing now)
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
Module      : Gargantext.Database.Action.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36
37 , getOrMkRoot
38 , getOrMk_RootWithCorpus
39 , TermType(..)
40 , DataOrigin(..)
41 , allDataOrigins
42
43 , do_api
44 )
45 where
46
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
49 import Data.Either
50 import Data.HashMap.Strict (HashMap)
51 import Data.Hashable (Hashable)
52 import Data.List (concat)
53 import Data.Map (Map, lookup)
import Data.Maybe (catMaybes, mapMaybe)
55 import Data.Monoid
56 import Data.Swagger
57 import Data.Text (splitOn)
58 import Data.Traversable (traverse)
59 import Data.Tuple.Extra (first, second)
60 import GHC.Generics (Generic)
61 import System.FilePath (FilePath)
62 import qualified Data.HashMap.Strict as HashMap
63 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
64 import qualified Data.Map as Map
65
66 import Gargantext.Core (Lang(..), PosTagAlgo(..))
67 import Gargantext.Core.Ext.IMT (toSchoolName)
68 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
69 import Gargantext.Core.Flow.Types
70 import Gargantext.Core.Text
71 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
72 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
73 import Gargantext.Core.Text.List (buildNgramsLists)
74 import Gargantext.Core.Text.Terms
75 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
76 import Gargantext.Core.Types (POS(NP))
77 import Gargantext.Core.Types.Individu (User(..))
78 import Gargantext.Core.Types.Main
79 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
80 import Gargantext.Database.Action.Flow.List
81 import Gargantext.Database.Action.Flow.Types
82 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
83 import Gargantext.Database.Action.Search (searchDocInDatabase)
84 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
85 import Gargantext.Database.Admin.Types.Hyperdata
86 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
87 import Gargantext.Database.Prelude
88 import Gargantext.Database.Query.Table.Ngrams
89 import Gargantext.Database.Query.Table.Node
90 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
91 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
92 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
93 import Gargantext.Database.Query.Table.NodeNodeNgrams2
94 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
95 import Gargantext.Database.Schema.Node (NodePoly(..))
96 import Gargantext.Database.Types
97 import Gargantext.Prelude
98 import Gargantext.Prelude.Crypto.Hash (Hash)
99 import qualified Gargantext.Core.Text.Corpus.API as API
100 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
101
102 ------------------------------------------------------------------------
103 -- TODO use internal with API name (could be old data)
-- | Provenance of a set of documents: either already present in the local
-- database ('InternalOrigin') or fetched from an external API
-- ('ExternalOrigin').  Both carry the external API they relate to.
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  -- strip the "_do_" field prefix in the Swagger schema, matching the JSON encoding
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
113
-- | Every possible 'DataOrigin': one 'InternalOrigin' per known external
-- API, followed by one 'ExternalOrigin' per known external API.
allDataOrigins :: [DataOrigin]
allDataOrigins = [ origin api
                 | origin <- [InternalOrigin, ExternalOrigin]
                 , api    <- API.externalAPIs
                 ]
117
118 ---------------
-- | Documents to be flowed into a corpus: 'DataOld' references documents
-- already stored in the database by their 'NodeId'; 'DataNew' carries
-- freshly fetched documents, already split into batches.
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
121
-- TODO use the split parameter in config file
-- | Retrieve the documents described by a 'DataOrigin'.
--
-- For an 'ExternalOrigin', documents are fetched from the external API
-- (limited by @li@) and split into batches of 500.  For an
-- 'InternalOrigin', the query is stemmed and searched in the master
-- user's corpus, returning only the ids of the matching documents.
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
-- was: DataNew <$> splitEvery 500 <$> …  (nested fmaps; compose instead)
getDataText (ExternalOrigin api) la q li = liftBase $
  DataNew . splitEvery 500 <$> API.get api (_tt_lang la) q li
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids
140
141 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into an existing corpus.
--
-- Old documents are (re)attached to the corpus through the user flow;
-- new documents go through the full corpus flow (insertion, ngrams
-- extraction, list building).
flowDataText :: ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText user dataText termType corpusId =
  case dataText of
    DataOld ids -> flowCorpusUser (_tt_lang termType) user target noCorpus ids
    DataNew txt -> flowCorpus user target termType txt
  where
    target   = Right [corpusId]
    noCorpus = Nothing :: Maybe HyperdataCorpus
153
154 ------------------------------------------------------------------------
-- TODO use proxy
-- | Deserialise IMT users from a file and flow them, in batches of 500,
-- into an annuaire (directory of contacts).
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> TermType Lang
             -> FilePath
             -> m AnnuaireId
flowAnnuaire user name lang filePath = do
  contacts <- liftBase readContacts
  flow (Nothing :: Maybe HyperdataAnnuaire) user name lang contacts
  where
    readContacts :: IO [[HyperdataContact]]
    readContacts = splitEvery 500 <$> deserialiseImtUsersFromFile filePath
165
166 ------------------------------------------------------------------------
-- | Parse a corpus file and flow its documents, in batches of 500, into
-- a corpus.
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> TermType Lang -> FileFormat -> FilePath
               -> m CorpusId
flowCorpusFile u n l la ff fp = do
  -- was: splitEvery 500 <$> take l <$> …  (nested fmaps; compose instead)
  docs <- liftBase $ splitEvery 500 . take l <$> parseFile ff fp
  flowCorpus u n la (map (map toHyperdataDocument) docs)
179
180 ------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
--
-- Flow documents into a user corpus using the default corpus hyperdata
-- (no custom 'HyperdataCorpus').
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus user name lang docs =
  flow (Nothing :: Maybe HyperdataCorpus) user name lang docs
190
191
-- | Generic flow: insert every batch of documents into the master
-- corpus, then run the user-side corpus flow over all inserted ids.
flow :: ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow corpus user corpusName lang docBatches = do
  -- TODO if public insertMasterDocs else insertUserDocs
  docIds <- concat <$> traverse (insertMasterDocs corpus lang) docBatches
  flowCorpusUser (lang ^. tt_lang) user corpusName corpus docIds
206
207 ------------------------------------------------------------------------
-- | User-side corpus flow: create (or fetch) the user's corpus node and
-- its child nodes (texts, list, dashboard, graph), attach the given
-- document ids, then build the ngrams lists and store them in the repo.
-- Returns the user corpus id.
--
-- NOTE(review): the sequence of DB effects below is order-dependent
-- (texts node first, then list), per the inline comments.
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  -- let gp = (GroupParams l 2 3 (StopSize 3))
  -- group ngrams by POS tag, with no pre-existing grouping (empty map)
  let gp = GroupWithPosTag l CoreNLP HashMap.empty
  ngs <- buildNgramsLists gp user userCorpusId masterCorpusId

  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire  rootUserId userId
  pure userCorpusId
250
251
-- | Insert documents into the master user's corpus and index their
-- ngrams.  Returns the ids of the inserted documents.
--
-- Pipeline: create/fetch the master corpus, insert the documents,
-- extract ngrams per document, insert the ngrams themselves, then link
-- (node, ngrams, type, weight) rows in the database.
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  -- ngrams extracted from each document, merged into
  -- ngrams -> ngrams type -> node id -> count
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
                <- mapNodeIdNgrams
                <$> documentIdWithNgrams
                    (extractNgramsT $ withLang lang documentsWithId)
                    documentsWithId

  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- to be removed
  let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  -- new
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ HashMap.toList mapNgramsDocs
  -- insertDocNgrams
  -- one row per (document, ngrams, type) triple; getCgramsId may return
  -- Nothing, in which case the whole row is dropped by catMaybes
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                         <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w) <- Map.toList mapNodeIdWeight
                       ]

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams
  pure ids'
301
302 ------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
-- | Insert documents into a corpus, deduplicated by their uniq hash id.
-- Returns the ids of the inserted rows and the documents paired with
-- their 'NodeId' (only documents actually inserted — see 'mergeData').
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([DocId], [Indexed NodeId a])
insertDocs uId cId hs = do
  -- stamp each document with a uniq hash id before insertion
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    -- join inserted ids back to the original documents via the hash
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)
321
322
323 ------------------------------------------------------------------------
-- | Pair a value with its uniq hash id.
--
-- Panics when the value carries no uniq id: documents are expected to
-- have been hashed (see 'addUniqId' in 'insertDocs') before this point.
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
  where
    -- fixed: the previous message blamed "toInsert", a different function
    err = panic "[ERROR] Database.Flow.viewUniqId': uniq id not found"
330
331
-- | Index the 'ReturnId's of rows actually inserted in the database by
-- their uniq hash, dropping rows that were already present.
toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r) )
               -- was: (\r -> reInserted r == True) — redundant comparison
               . filter reInserted
337
-- | Join inserted row ids with the original documents through their
-- shared hash.  Documents whose hash has no matching 'ReturnId' are
-- silently dropped.
mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
-- was: catMaybes . map toDocumentWithId — mapMaybe is the idiom
mergeData rs = mapMaybe toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      (\ret -> Indexed (reId ret) hpd) <$> lookup sha rs
346
347 ------------------------------------------------------------------------
348 ------------------------------------------------------------------------
349 ------------------------------------------------------------------------
-- | Annotate each indexed document with the ngrams extracted from it by
-- the given extraction function.
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (HashMap b (Map NgramsType Int)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    -- was: do { e <- f (_unIndex d); pure (DocumentIdWithNgrams d e) } — just an fmap
    toDocumentIdWithNgrams d = DocumentIdWithNgrams d <$> f (_unIndex d)
360
361
-- | TODO check optimization
--
-- Merge per-document ngrams maps into a single nested map
-- @ngrams -> ngrams type -> node id -> count@, summing the counts when
-- the same (ngrams, type, node) triple occurs more than once.
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                       (Map NgramsType
                            (Map NodeId Int)
                       )
mapNodeIdNgrams = HashMap.unionsWith mergeTypes . map perDocument
  where
    mergeTypes = Map.unionWith (Map.unionWith (+))
    -- tag every count of a document with that document's node id
    perDocument :: DocumentIdWithNgrams a b
                -> HashMap b (Map NgramsType (Map NodeId Int))
    perDocument doc =
      let nodeId = _index (documentWithId doc)
      in  fmap (fmap (Map.singleton nodeId)) (documentNgrams doc)
376
377
378 ------------------------------------------------------------------------
-- | For a contact, the only ngrams extracted are the author's last name
-- (or the literal "Nothing" when absent), tagged as 'Authors'.
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extract _l hc' = do
          -- was: (\a -> [a]) — use the cons section instead
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (:[])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
391
392
-- | Extract source, institutes, authors and text terms from a document.
-- Missing fields fall back to the literal "Nothing"; institutes and
-- authors are split on ", "; terms are extracted from the document text
-- and enriched with POS information.
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hd_authors doc

          -- was: map … <$> concat <$> liftBase … (nested fmaps; compose instead)
          terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP) . concat
                    <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $  [(SimpleNgrams source, Map.singleton Sources 1) ]
               <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
               <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
               <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
425
-- | Delegate ngrams extraction to the node's hyperdata payload
-- (the last field of the 'Node' constructor).
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
429
-- | Delegate text extraction to the node's hyperdata payload
-- (the last field of the 'Node' constructor).
instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h
433
434