{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
-- check userId       CanFillUserCorpus   userCorpusId
-- check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( FlowCmdM
  , getDataText
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowAnnuaire
  , insertMasterDocs

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  )
    where

import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.List (concat)
import qualified Data.Map as Map
import Data.Map (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import Data.Text (splitOn, intercalate)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import System.FilePath (FilePath)

import Gargantext.Core (Lang(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..))
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (Terms(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Table.NodeNodeNgrams2
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)

------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs

---------------
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]

-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
                                         <$> splitEvery 500
                                         <$> API.get api (_tt_lang la) q li

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids

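-- A minimal usage sketch of the two origins (comment only; the user, query,
-- limit and corpus id are hypothetical, and 'api' stands for whichever
-- 'API.ExternalAPIs' value is configured):
--
--   do txt <- getDataText (ExternalOrigin api) (Multi EN) "glyphosate" (Just 100)
--      _   <- flowDataText (UserName "alice") txt (Multi EN) someCorpusId
--      pure ()
--
-- With 'InternalOrigin' no external call is made: the query is stemmed and
-- matched against documents already present in the master corpus.
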
-------------------------------------------------------------------------------
flowDataText :: ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt

------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> m AnnuaireId
flowAnnuaire u n l filePath = do
  docs <- liftBase ((splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs

------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> TermType Lang -> FileFormat -> FilePath
               -> m CorpusId
flowCorpusFile u n l la ff fp = do
  docs <- liftBase ( splitEvery 500
                   <$> take l
                   <$> parseFile ff fp
                   )
  flowCorpus u n la (map (map toHyperdataDocument) docs)

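-- A minimal usage sketch (comment only; user, corpus name, limit, language and
-- path are hypothetical; 'someFormat' stands for a 'FileFormat' value
-- understood by 'parseFile'):
--
--   corpusId <- flowCorpusFile (UserName "alice")
--                              (Left "My corpus")
--                              10000
--                              (Multi EN)
--                              someFormat
--                              "/tmp/docs.csv"
--
-- Parsed documents are truncated to the limit, split into batches of 500 and
-- handed to 'flowCorpus'.
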
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


flow :: ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs = do
  -- TODO if public insertMasterDocs else insertUserDocs
  ids <- traverse (insertMasterDocs c la) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)

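-- The pipeline in 'flow', step by step (comment only; 'Multi EN' and the
-- corpus name are illustrative values):
--
--   flowCorpus (UserName "alice") (Left "My corpus") (Multi EN) batches
--   = flow (Nothing :: Maybe HyperdataCorpus) (UserName "alice") (Left "My corpus") (Multi EN) batches
--
-- which (1) runs 'insertMasterDocs' once per batch, writing the documents and
-- their extracted ngrams into the master corpus, then (2) hands the collected
-- document ids to 'flowCorpusUser', which attaches them to the user's corpus
-- and builds its lists.
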
------------------------------------------------------------------------
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph userCorpusId userId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  pure userCorpusId


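-- Sketch of the children created under the user's corpus by the code above
-- (comment only, order as executed):
--
--   user corpus
--   ├── NodeTexts
--   ├── NodeList       (filled from buildNgramsLists via flowList_DbRepo)
--   ├── NodeDashboard
--   └── NodeGraph
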
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs <- mapNodeIdNgrams
                <$> documentIdWithNgrams
                      (extractNgramsT $ withLang lang documentsWithId)
                      documentsWithId

  terms2id <- insertNgrams $ Map.keys mapNgramsDocs
  -- to be removed
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs

  -- new
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ Map.toList mapNgramsDocs
  -- insertDocNgrams
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                         <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w) <- Map.toList mapNodeIdWeight
                       ]

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams
  pure ids'

------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([DocId], [DocumentWithId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)


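-- Deduplication sketch (comment only, derived from the helpers below):
-- 'addUniqId' stamps each document with a content hash, 'insertDb' reports for
-- each hash whether a row was really inserted ('reInserted'), and 'mergeData'
-- pairs the returned node ids back with the original payloads, dropping
-- documents whose hash the database did not report as newly inserted.
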
------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.viewUniqId'"


toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      DocumentWithId <$> fmap reId (lookup sha rs)
                     <*> Just hpd

------------------------------------------------------------------------
instance HasText HyperdataContact
  where
    hasText = undefined
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e

------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors]

instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hd_title    h
                          , _hd_abstract h
                          ]


instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source     = text2ngrams
                         $ maybe "Nothing" identity
                         $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
                         $ _hd_institutes doc

              authors    = map text2ngrams
                         $ maybe ["Nothing"] (splitOn ", ")
                         $ _hd_authors doc

          terms' <- map text2ngrams
                 <$> map (intercalate " " . _terms_label)
                 <$> concat
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ Map.fromList $  [(source, Map.singleton Sources 1)]
                              <> [(i', Map.singleton Institutes  1) | i' <- institutes]
                              <> [(a', Map.singleton Authors     1) | a' <- authors   ]
                              <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'    ]

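-- Shape of the map produced for one document (comment only; the field values
-- are illustrative, the keys and weights are the ones built above):
--
--   fromList [ (text2ngrams "PubMed"         , fromList [(Sources    , 1)])
--            , (text2ngrams "Some University", fromList [(Institutes , 1)])
--            , (text2ngrams "Doe, J."        , fromList [(Authors    , 1)])
--            , (text2ngrams "gene expression", fromList [(NgramsTerms, 1)])
--            ]
--
-- 'insertMasterDocs' above later nests these maps under each document's NodeId
-- (see its "maps ::" comment).
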
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h

instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h
