]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[FIX] Optim count table
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36
37 , getOrMkRoot
38 , getOrMk_RootWithCorpus
39 , TermType(..)
40 , DataOrigin(..)
41 , allDataOrigins
42
43 , do_api
44 )
45 where
46
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
49 import Data.Either
50 import Data.List (concat)
51 import qualified Data.Map as Map
52 import Data.Map (Map, lookup)
53 import Data.Maybe (catMaybes)
54 import Data.Monoid
55 import Data.Swagger
56 import Data.Text (splitOn, intercalate)
57 import Data.Traversable (traverse)
58 import Data.Tuple.Extra (first, second)
59 import GHC.Generics (Generic)
60 import System.FilePath (FilePath)
61
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Ext.IMT (toSchoolName)
64 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
65 import Gargantext.Core.Flow.Types
66 import Gargantext.Core.Text
67 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
68 import Gargantext.Core.Text.List (buildNgramsLists,StopSize(..))
69 import Gargantext.Core.Text.Terms
70 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
71 import Gargantext.Core.Types (Terms(..))
72 import Gargantext.Core.Types.Individu (User(..))
73 import Gargantext.Core.Types.Main
74 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
75 import Gargantext.Database.Action.Flow.List
76 import Gargantext.Database.Action.Flow.Types
77 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
78 import Gargantext.Database.Action.Search (searchDocInDatabase)
79 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
80 import Gargantext.Database.Admin.Types.Hyperdata
81 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
82 import Gargantext.Database.Query.Table.Node
83 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
84 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
85 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
86 import Gargantext.Database.Query.Table.Ngrams
87 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
88 import Gargantext.Database.Query.Table.NodeNodeNgrams2
89 import Gargantext.Database.Prelude
90 import Gargantext.Database.Schema.Node (NodePoly(..))
91 import Gargantext.Prelude
92 import Gargantext.Prelude.Crypto.Hash (Hash)
93 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
94 import qualified Gargantext.Core.Text.Corpus.API as API
95
96 ------------------------------------------------------------------------
97 -- TODO use internal with API name (could be old data)
98 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
99 | ExternalOrigin { _do_api :: API.ExternalAPIs }
100 -- TODO Web
101 deriving (Generic, Eq)
102
103 makeLenses ''DataOrigin
104 deriveJSON (unPrefix "_do_") ''DataOrigin
105 instance ToSchema DataOrigin where
106 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
107
108 allDataOrigins :: [DataOrigin]
109 allDataOrigins = map InternalOrigin API.externalAPIs
110 <> map ExternalOrigin API.externalAPIs
111
112 ---------------
113 data DataText = DataOld ![NodeId]
114 | DataNew ![[HyperdataDocument]]
115
116 -- TODO use the split parameter in config file
117 getDataText :: FlowCmdM env err m
118 => DataOrigin
119 -> TermType Lang
120 -> API.Query
121 -> Maybe API.Limit
122 -> m DataText
123 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
124 <$> splitEvery 500
125 <$> API.get api (_tt_lang la) q li
126 getDataText (InternalOrigin _) _la q _li = do
127 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
128 (UserName userMaster)
129 (Left "")
130 (Nothing :: Maybe HyperdataCorpus)
131 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
132 pure $ DataOld ids
133
134 -------------------------------------------------------------------------------
135 flowDataText :: FlowCmdM env err m
136 => User
137 -> DataText
138 -> TermType Lang
139 -> CorpusId
140 -> m CorpusId
141 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
142 where
143 corpusType = (Nothing :: Maybe HyperdataCorpus)
144 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
145
146 ------------------------------------------------------------------------
147 -- TODO use proxy
148 flowAnnuaire :: FlowCmdM env err m
149 => User
150 -> Either CorpusName [CorpusId]
151 -> (TermType Lang)
152 -> FilePath
153 -> m AnnuaireId
154 flowAnnuaire u n l filePath = do
155 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
156 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
157
158 ------------------------------------------------------------------------
159 flowCorpusFile :: FlowCmdM env err m
160 => User
161 -> Either CorpusName [CorpusId]
162 -> Limit -- Limit the number of docs (for dev purpose)
163 -> TermType Lang -> FileFormat -> FilePath
164 -> m CorpusId
165 flowCorpusFile u n l la ff fp = do
166 docs <- liftBase ( splitEvery 500
167 <$> take l
168 <$> parseFile ff fp
169 )
170 flowCorpus u n la (map (map toHyperdataDocument) docs)
171
172 ------------------------------------------------------------------------
173 -- | TODO improve the needed type to create/update a corpus
174 -- (For now, Either is enough)
175 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
176 => User
177 -> Either CorpusName [CorpusId]
178 -> TermType Lang
179 -> [[a]]
180 -> m CorpusId
181 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
182
183
184 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
185 => Maybe c
186 -> User
187 -> Either CorpusName [CorpusId]
188 -> TermType Lang
189 -> [[a]]
190 -> m CorpusId
191 flow c u cn la docs = do
192 -- TODO if public insertMasterDocs else insertUserDocs
193 ids <- traverse (insertMasterDocs c la) docs
194 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
195
196 ------------------------------------------------------------------------
197 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
198 => Lang
199 -> User
200 -> Either CorpusName [CorpusId]
201 -> Maybe c
202 -> [NodeId]
203 -> m CorpusId
204 flowCorpusUser l user corpusName ctype ids = do
205 -- User Flow
206 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
207 listId <- getOrMkList userCorpusId userId
208 -- _cooc <- insertDefaultNode NodeListCooc listId userId
209 -- TODO: check if present already, ignore
210 _ <- Doc.add userCorpusId ids
211
212 _tId <- insertDefaultNode NodeTexts userCorpusId userId
213 -- printDebug "Node Text Ids:" tId
214
215 -- User List Flow
216 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
217 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
218 _userListId <- flowList_DbRepo listId ngs
219 _mastListId <- getOrMkList masterCorpusId masterUserId
220 -- _ <- insertOccsUpdates userCorpusId mastListId
221 -- printDebug "userListId" userListId
222 -- User Graph Flow
223 _ <- insertDefaultNode NodeDashboard userCorpusId userId
224 _ <- insertDefaultNode NodeGraph userCorpusId userId
225 --_ <- mkPhylo userCorpusId userId
226 -- Annuaire Flow
227 -- _ <- mkAnnuaire rootUserId userId
228 pure userCorpusId
229
230
231 insertMasterDocs :: ( FlowCmdM env err m
232 , FlowCorpus a
233 , MkCorpus c
234 )
235 => Maybe c
236 -> TermType Lang
237 -> [a]
238 -> m [DocId]
239 insertMasterDocs c lang hs = do
240 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
241 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
242 _ <- Doc.add masterCorpusId ids'
243 -- TODO
244 -- create a corpus with database name (CSV or PubMed)
245 -- add documents to the corpus (create node_node link)
246 -- this will enable global database monitoring
247
248 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
249 mapNgramsDocs <- mapNodeIdNgrams
250 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
251
252 terms2id <- insertNgrams $ Map.keys mapNgramsDocs
253 -- to be removed
254 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
255
256 -- new
257 lId <- getOrMkList masterCorpusId masterUserId
258 mapCgramsId <- listInsertDb lId toNodeNgramsW'
259 $ map (first _ngramsTerms . second Map.keys)
260 $ Map.toList mapNgramsDocs
261 -- insertDocNgrams
262 _return <- insertNodeNodeNgrams2
263 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
264 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
265 <*> Just (fromIntegral w :: Double)
266 | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
267 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
268 , (nId, w) <- Map.toList mapNodeIdWeight
269 ]
270
271 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
272 -- to be removed
273 _ <- insertDocNgrams lId indexedNgrams
274 pure ids'
275
276 ------------------------------------------------------------------------
277 -- TODO Type NodeDocumentUnicised
278 insertDocs :: ( FlowCmdM env err m
279 -- , FlowCorpus a
280 , FlowInsertDB a
281 )
282 => UserId
283 -> CorpusId
284 -> [a]
285 -> m ([DocId], [DocumentWithId a])
286 insertDocs uId cId hs = do
287 let docs = map addUniqId hs
288 newIds <- insertDb uId cId docs
289 -- printDebug "newIds" newIds
290 let
291 newIds' = map reId newIds
292 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
293 _ <- Doc.add cId newIds'
294 pure (newIds', documentsWithId)
295
296
297
298 ------------------------------------------------------------------------
299 viewUniqId' :: UniqId a
300 => a
301 -> (Hash, a)
302 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
303 where
304 err = panic "[ERROR] Database.Flow.toInsert"
305
306
307 toInserted :: [ReturnId]
308 -> Map Hash ReturnId
309 toInserted =
310 Map.fromList . map (\r -> (reUniqId r, r) )
311 . filter (\r -> reInserted r == True)
312
313 mergeData :: Map Hash ReturnId
314 -> Map Hash a
315 -> [DocumentWithId a]
316 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
317 where
318 toDocumentWithId (sha,hpd) =
319 DocumentWithId <$> fmap reId (lookup sha rs)
320 <*> Just hpd
321
322 ------------------------------------------------------------------------
323 instance HasText HyperdataContact
324 where
325 hasText = undefined
326 ------------------------------------------------------------------------
327 ------------------------------------------------------------------------
328 documentIdWithNgrams :: HasNodeError err
329 => (a
330 -> Cmd err (Map Ngrams (Map NgramsType Int)))
331 -> [DocumentWithId a]
332 -> Cmd err [DocumentIdWithNgrams a]
333 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
334 where
335 toDocumentIdWithNgrams d = do
336 e <- f $ documentData d
337 pure $ DocumentIdWithNgrams d e
338
339 ------------------------------------------------------------------------
340 instance ExtractNgramsT HyperdataContact
341 where
342 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
343 where
344 extract :: TermType Lang -> HyperdataContact
345 -> Cmd err (Map Ngrams (Map NgramsType Int))
346 extract _l hc' = do
347 let authors = map text2ngrams
348 $ maybe ["Nothing"] (\a -> [a])
349 $ view (hc_who . _Just . cw_lastName) hc'
350
351 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
352
353 instance HasText HyperdataDocument
354 where
355 hasText h = catMaybes [ _hd_title h
356 , _hd_abstract h
357 ]
358
359
360 instance ExtractNgramsT HyperdataDocument
361 where
362 extractNgramsT :: TermType Lang
363 -> HyperdataDocument
364 -> Cmd err (Map Ngrams (Map NgramsType Int))
365 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
366 where
367 extractNgramsT' :: TermType Lang
368 -> HyperdataDocument
369 -> Cmd err (Map Ngrams (Map NgramsType Int))
370 extractNgramsT' lang' doc = do
371 let source = text2ngrams
372 $ maybe "Nothing" identity
373 $ _hd_source doc
374
375 institutes = map text2ngrams
376 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
377 $ _hd_institutes doc
378
379 authors = map text2ngrams
380 $ maybe ["Nothing"] (splitOn ", ")
381 $ _hd_authors doc
382
383 terms' <- map text2ngrams
384 <$> map (intercalate " " . _terms_label)
385 <$> concat
386 <$> liftBase (extractTerms lang' $ hasText doc)
387
388 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
389 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
390 <> [(a', Map.singleton Authors 1) | a' <- authors ]
391 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
392
393 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
394 where
395 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
396
397 instance HasText a => HasText (Node a)
398 where
399 hasText (Node _ _ _ _ _ _ _ h) = hasText h
400
401