]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
merging phylo with uniform distribution of terms in quality AND selection of eligible...
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36
37 , getOrMkRoot
38 , getOrMk_RootWithCorpus
39 , TermType(..)
40 , DataOrigin(..)
41 , allDataOrigins
42
43 , do_api
44 )
45 where
46
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
49 import Data.Either
50 import Data.List (concat)
51 import qualified Data.Map as Map
52 import Data.Map (Map, lookup)
53 import Data.Maybe (catMaybes)
54 import Data.Monoid
55 import Data.Swagger
56 import Data.Text (splitOn, intercalate)
57 import Data.Traversable (traverse)
58 import Data.Tuple.Extra (first, second)
59 import GHC.Generics (Generic)
60 import System.FilePath (FilePath)
61
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Ext.IMT (toSchoolName)
64 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
65 import Gargantext.Core.Flow.Types
66 import Gargantext.Core.Text
67 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
68 import Gargantext.Core.Text.List (buildNgramsLists)
69 import Gargantext.Core.Text.Group (StopSize(..), GroupParams(..))
70 import Gargantext.Core.Text.Terms
71 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
72 import Gargantext.Core.Types (Terms(..))
73 import Gargantext.Core.Types.Individu (User(..))
74 import Gargantext.Core.Types.Main
75 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
76 import Gargantext.Database.Action.Flow.List
77 import Gargantext.Database.Action.Flow.Types
78 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
79 import Gargantext.Database.Action.Search (searchDocInDatabase)
80 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
81 import Gargantext.Database.Admin.Types.Hyperdata
82 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
83 import Gargantext.Database.Prelude
84 import Gargantext.Database.Query.Table.Ngrams
85 import Gargantext.Database.Query.Table.Node
86 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
87 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
88 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
89 import Gargantext.Database.Query.Table.NodeNodeNgrams2
90 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
91 import Gargantext.Database.Schema.Node (NodePoly(..))
92 import Gargantext.Prelude
93 import Gargantext.Prelude.Crypto.Hash (Hash)
94 import qualified Gargantext.Core.Text.Corpus.API as API
95 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
96
97 ------------------------------------------------------------------------
98 -- TODO use internal with API name (could be old data)
99 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
100 | ExternalOrigin { _do_api :: API.ExternalAPIs }
101 -- TODO Web
102 deriving (Generic, Eq)
103
104 makeLenses ''DataOrigin
105 deriveJSON (unPrefix "_do_") ''DataOrigin
106 instance ToSchema DataOrigin where
107 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
108
109 allDataOrigins :: [DataOrigin]
110 allDataOrigins = map InternalOrigin API.externalAPIs
111 <> map ExternalOrigin API.externalAPIs
112
113 ---------------
114 data DataText = DataOld ![NodeId]
115 | DataNew ![[HyperdataDocument]]
116
117 -- TODO use the split parameter in config file
118 getDataText :: FlowCmdM env err m
119 => DataOrigin
120 -> TermType Lang
121 -> API.Query
122 -> Maybe API.Limit
123 -> m DataText
124 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
125 <$> splitEvery 500
126 <$> API.get api (_tt_lang la) q li
127 getDataText (InternalOrigin _) _la q _li = do
128 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
129 (UserName userMaster)
130 (Left "")
131 (Nothing :: Maybe HyperdataCorpus)
132 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
133 pure $ DataOld ids
134
135 -------------------------------------------------------------------------------
136 flowDataText :: ( FlowCmdM env err m
137 )
138 => User
139 -> DataText
140 -> TermType Lang
141 -> CorpusId
142 -> m CorpusId
143 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
144 where
145 corpusType = (Nothing :: Maybe HyperdataCorpus)
146 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
147
148 ------------------------------------------------------------------------
149 -- TODO use proxy
150 flowAnnuaire :: (FlowCmdM env err m)
151 => User
152 -> Either CorpusName [CorpusId]
153 -> (TermType Lang)
154 -> FilePath
155 -> m AnnuaireId
156 flowAnnuaire u n l filePath = do
157 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
158 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
159
160 ------------------------------------------------------------------------
161 flowCorpusFile :: (FlowCmdM env err m)
162 => User
163 -> Either CorpusName [CorpusId]
164 -> Limit -- Limit the number of docs (for dev purpose)
165 -> TermType Lang -> FileFormat -> FilePath
166 -> m CorpusId
167 flowCorpusFile u n l la ff fp = do
168 docs <- liftBase ( splitEvery 500
169 <$> take l
170 <$> parseFile ff fp
171 )
172 flowCorpus u n la (map (map toHyperdataDocument) docs)
173
174 ------------------------------------------------------------------------
175 -- | TODO improve the needed type to create/update a corpus
176 -- (For now, Either is enough)
177 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
178 => User
179 -> Either CorpusName [CorpusId]
180 -> TermType Lang
181 -> [[a]]
182 -> m CorpusId
183 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
184
185
186 flow :: ( FlowCmdM env err m
187 , FlowCorpus a
188 , MkCorpus c
189 )
190 => Maybe c
191 -> User
192 -> Either CorpusName [CorpusId]
193 -> TermType Lang
194 -> [[a]]
195 -> m CorpusId
196 flow c u cn la docs = do
197 -- TODO if public insertMasterDocs else insertUserDocs
198 ids <- traverse (insertMasterDocs c la) docs
199 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
200
201 ------------------------------------------------------------------------
202 flowCorpusUser :: ( FlowCmdM env err m
203 , MkCorpus c
204 )
205 => Lang
206 -> User
207 -> Either CorpusName [CorpusId]
208 -> Maybe c
209 -> [NodeId]
210 -> m CorpusId
211 flowCorpusUser l user corpusName ctype ids = do
212 -- User Flow
213 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
214 listId <- getOrMkList userCorpusId userId
215 -- _cooc <- insertDefaultNode NodeListCooc listId userId
216 -- TODO: check if present already, ignore
217 _ <- Doc.add userCorpusId ids
218
219 _tId <- insertDefaultNode NodeTexts userCorpusId userId
220 -- printDebug "Node Text Ids:" tId
221
222 -- User List Flow
223 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
224 ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
225 _userListId <- flowList_DbRepo listId ngs
226 _mastListId <- getOrMkList masterCorpusId masterUserId
227 -- _ <- insertOccsUpdates userCorpusId mastListId
228 -- printDebug "userListId" userListId
229 -- User Graph Flow
230 _ <- insertDefaultNode NodeDashboard userCorpusId userId
231 _ <- insertDefaultNode NodeGraph userCorpusId userId
232 --_ <- mkPhylo userCorpusId userId
233 -- Annuaire Flow
234 -- _ <- mkAnnuaire rootUserId userId
235 pure userCorpusId
236
237
238 insertMasterDocs :: ( FlowCmdM env err m
239 , FlowCorpus a
240 , MkCorpus c
241 )
242 => Maybe c
243 -> TermType Lang
244 -> [a]
245 -> m [DocId]
246 insertMasterDocs c lang hs = do
247 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
248 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
249 _ <- Doc.add masterCorpusId ids'
250 -- TODO
251 -- create a corpus with database name (CSV or PubMed)
252 -- add documents to the corpus (create node_node link)
253 -- this will enable global database monitoring
254
255 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
256 mapNgramsDocs <- mapNodeIdNgrams
257 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
258
259 terms2id <- insertNgrams $ Map.keys mapNgramsDocs
260 -- to be removed
261 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
262
263 -- new
264 lId <- getOrMkList masterCorpusId masterUserId
265 mapCgramsId <- listInsertDb lId toNodeNgramsW'
266 $ map (first _ngramsTerms . second Map.keys)
267 $ Map.toList mapNgramsDocs
268 -- insertDocNgrams
269 _return <- insertNodeNodeNgrams2
270 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
271 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
272 <*> Just (fromIntegral w :: Double)
273 | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
274 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
275 , (nId, w) <- Map.toList mapNodeIdWeight
276 ]
277
278 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
279 -- to be removed
280 _ <- insertDocNgrams lId indexedNgrams
281 pure ids'
282
283 ------------------------------------------------------------------------
284 -- TODO Type NodeDocumentUnicised
285 insertDocs :: ( FlowCmdM env err m
286 -- , FlowCorpus a
287 , FlowInsertDB a
288 )
289 => UserId
290 -> CorpusId
291 -> [a]
292 -> m ([DocId], [DocumentWithId a])
293 insertDocs uId cId hs = do
294 let docs = map addUniqId hs
295 newIds <- insertDb uId cId docs
296 -- printDebug "newIds" newIds
297 let
298 newIds' = map reId newIds
299 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
300 _ <- Doc.add cId newIds'
301 pure (newIds', documentsWithId)
302
303
304
305 ------------------------------------------------------------------------
306 viewUniqId' :: UniqId a
307 => a
308 -> (Hash, a)
309 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
310 where
311 err = panic "[ERROR] Database.Flow.toInsert"
312
313
314 toInserted :: [ReturnId]
315 -> Map Hash ReturnId
316 toInserted =
317 Map.fromList . map (\r -> (reUniqId r, r) )
318 . filter (\r -> reInserted r == True)
319
320 mergeData :: Map Hash ReturnId
321 -> Map Hash a
322 -> [DocumentWithId a]
323 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
324 where
325 toDocumentWithId (sha,hpd) =
326 DocumentWithId <$> fmap reId (lookup sha rs)
327 <*> Just hpd
328
329 ------------------------------------------------------------------------
330 instance HasText HyperdataContact
331 where
332 hasText = undefined
333 ------------------------------------------------------------------------
334 ------------------------------------------------------------------------
335 documentIdWithNgrams :: HasNodeError err
336 => (a
337 -> Cmd err (Map Ngrams (Map NgramsType Int)))
338 -> [DocumentWithId a]
339 -> Cmd err [DocumentIdWithNgrams a]
340 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
341 where
342 toDocumentIdWithNgrams d = do
343 e <- f $ documentData d
344 pure $ DocumentIdWithNgrams d e
345
346 ------------------------------------------------------------------------
347 instance ExtractNgramsT HyperdataContact
348 where
349 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
350 where
351 extract :: TermType Lang -> HyperdataContact
352 -> Cmd err (Map Ngrams (Map NgramsType Int))
353 extract _l hc' = do
354 let authors = map text2ngrams
355 $ maybe ["Nothing"] (\a -> [a])
356 $ view (hc_who . _Just . cw_lastName) hc'
357
358 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
359
360 instance HasText HyperdataDocument
361 where
362 hasText h = catMaybes [ _hd_title h
363 , _hd_abstract h
364 ]
365
366
367 instance ExtractNgramsT HyperdataDocument
368 where
369 extractNgramsT :: TermType Lang
370 -> HyperdataDocument
371 -> Cmd err (Map Ngrams (Map NgramsType Int))
372 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
373 where
374 extractNgramsT' :: TermType Lang
375 -> HyperdataDocument
376 -> Cmd err (Map Ngrams (Map NgramsType Int))
377 extractNgramsT' lang' doc = do
378 let source = text2ngrams
379 $ maybe "Nothing" identity
380 $ _hd_source doc
381
382 institutes = map text2ngrams
383 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
384 $ _hd_institutes doc
385
386 authors = map text2ngrams
387 $ maybe ["Nothing"] (splitOn ", ")
388 $ _hd_authors doc
389
390 terms' <- map text2ngrams
391 <$> map (intercalate " " . _terms_label)
392 <$> concat
393 <$> liftBase (extractTerms lang' $ hasText doc)
394
395 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
396 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
397 <> [(a', Map.singleton Authors 1) | a' <- authors ]
398 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
399
400 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
401 where
402 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
403
404 instance HasText a => HasText (Node a)
405 where
406 hasText (Node _ _ _ _ _ _ _ h) = hasText h
407
408