]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[Social Lists] WIP
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module      : Gargantext.Database.Action.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35 , insertMasterDocs
36
37 , getOrMkRoot
38 , getOrMk_RootWithCorpus
39 , TermType(..)
40 , DataOrigin(..)
41 , allDataOrigins
42
43 , do_api
44 )
45 where
46
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
49 import Data.Either
50 import Data.List (concat)
51 import qualified Data.Map as Map
52 import Data.Map (Map, lookup)
53 import Data.Maybe (catMaybes)
54 import Data.Monoid
55 import Data.Swagger
56 import Data.Text (splitOn, intercalate)
57 import Data.Traversable (traverse)
58 import Data.Tuple.Extra (first, second)
59 import GHC.Generics (Generic)
60 import System.FilePath (FilePath)
61
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Ext.IMT (toSchoolName)
64 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
65 import Gargantext.Core.Flow.Types
66 import Gargantext.Core.Text
67 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
68 import Gargantext.Core.Text.List (buildNgramsLists,StopSize(..))
69 import Gargantext.Core.Text.Terms
70 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
71 import Gargantext.Core.Types (Terms(..))
72 import Gargantext.Core.Types.Individu (User(..))
73 import Gargantext.Core.Types.Main
74 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
75 import Gargantext.Database.Action.Flow.List
76 import Gargantext.Database.Action.Flow.Types
77 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
78 import Gargantext.Database.Action.Search (searchDocInDatabase)
79 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
80 import Gargantext.Database.Admin.Types.Hyperdata
81 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
82 import Gargantext.Database.Prelude
83 import Gargantext.Database.Query.Table.Ngrams
84 import Gargantext.Database.Query.Table.Node
85 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
86 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
87 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
88 import Gargantext.Database.Query.Table.NodeNodeNgrams2
89 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
90 import Gargantext.Database.Schema.Node (NodePoly(..))
91 import Gargantext.Prelude
92 import Gargantext.Prelude.Crypto.Hash (Hash)
93 import qualified Gargantext.Core.Text.Corpus.API as API
94 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
95
96 ------------------------------------------------------------------------
97 -- TODO use internal with API name (could be old data)
-- | Provenance of a batch of documents: 'InternalOrigin' means the data
--   is already in the local database (indexed under the given API name,
--   possibly old data), 'ExternalOrigin' means it must be fetched from
--   that external API.
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)
102
-- Lenses and JSON (de)serialisation for 'DataOrigin'; JSON field names
-- drop the "_do_" prefix (e.g. "api").
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
-- Swagger schema mirrors the JSON encoding above.
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
107
-- | Every possible 'DataOrigin': each known external API, first wrapped
--   as internal then as external (internal origins listed first).
allDataOrigins :: [DataOrigin]
allDataOrigins =
  [ origin api
  | origin <- [InternalOrigin, ExternalOrigin]
  , api    <- API.externalAPIs
  ]
111
112 ---------------
-- | Documents to be flowed into a corpus: either ids of documents
--   already stored in the database ('DataOld'), or freshly fetched
--   documents grouped into batches ('DataNew').
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
115
116 -- TODO use the split parameter in config file
-- | Resolve a 'DataOrigin' into concrete data for the given query:
--   external origins fetch documents from the API (in batches of 500),
--   internal origins search the master corpus already in the database.
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li =
  -- Batch size 500 is hard-coded; TODO use the split parameter in config.
  liftBase $ DataNew . splitEvery 500 <$> API.get api (_tt_lang la) q li
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <-
    getOrMk_RootWithCorpus (UserName userMaster)
                           (Left "")
                           (Nothing :: Maybe HyperdataCorpus)
  -- Stem the query before searching the master corpus.
  docIds <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure (DataOld docIds)
133
134 -------------------------------------------------------------------------------
-- | Flow previously resolved data ('getDataText' result) into an
--   existing corpus, returning the corpus id.
flowDataText :: ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText user dataText tt cid =
  case dataText of
    -- Documents already in the DB: just attach them to the user corpus.
    DataOld ids  -> flowCorpusUser (_tt_lang tt) user target
                                   (Nothing :: Maybe HyperdataCorpus) ids
    -- Fresh documents: run the full corpus flow.
    DataNew txts -> flowCorpus user target tt txts
  where
    target = Right [cid]
146
147 ------------------------------------------------------------------------
148 -- TODO use proxy
-- | Build an annuaire (directory of contacts) from an IMT users file.
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> TermType Lang
             -> FilePath
             -> m AnnuaireId
flowAnnuaire user corpus lang filePath = do
  contactBatches <- liftBase readContacts
  flow (Nothing :: Maybe HyperdataAnnuaire) user corpus lang contactBatches
  where
    -- Deserialise the contacts file and split into batches of 500.
    readContacts :: IO [[HyperdataContact]]
    readContacts = splitEvery 500 <$> deserialiseImtUsersFromFile filePath
158
159 ------------------------------------------------------------------------
-- | Parse a corpus file and flow (up to @limit@) of its documents into
--   a corpus, in batches of 500.
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> TermType Lang -> FileFormat -> FilePath
               -> m CorpusId
flowCorpusFile user corpus limit la ff fp = do
  parsed <- liftBase (parseFile ff fp)
  let batches = splitEvery 500 (take limit parsed)
  flowCorpus user corpus la (map (map toHyperdataDocument) batches)
172
173 ------------------------------------------------------------------------
174 -- | TODO improve the needed type to create/update a corpus
175 -- (For now, Either is enough)
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
-- Specialisation of 'flow' with no corpus hyperdata override.
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus user corpus lang docs =
  flow (Nothing :: Maybe HyperdataCorpus) user corpus lang docs
183
184
-- | Insert all document batches into the master corpus, then attach the
--   resulting document ids to the user's corpus.
flow :: ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs = do
  -- TODO if public insertMasterDocs else insertUserDocs
  docIds <- concat <$> traverse (insertMasterDocs c la) docs
  flowCorpusUser (la ^. tt_lang) u cn c docIds
199
200 ------------------------------------------------------------------------
-- | Build the user-side corpus tree and attach the given document ids:
--   corpus node, list node, texts node, dashboard and graph nodes.
--   Ngrams lists are built against the master corpus.
--   NOTE(review): the DB actions below are order-dependent (parents must
--   exist before children/links are created) — do not reorder.
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  -- Link the documents to the user corpus.
  _ <- Doc.add userCorpusId ids

  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  -- Master corpus (shared, owned by userMaster) backs the ngrams lists.
  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  -- Magic numbers 2 3 (StopSize 3) are buildNgramsLists parameters;
  -- semantics defined in Gargantext.Core.Text.List.
  ngs <- buildNgramsLists user l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire  rootUserId userId
  pure userCorpusId
235
236
-- | Insert documents into the master corpus, extract and index their
--   ngrams, and return the inserted document ids.
--   NOTE(review): order-dependent DB pipeline; contains both the new
--   node-ngrams path and a legacy path marked "to be removed".
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  -- Extract ngrams per document, then regroup by ngram -> type -> doc count.
  mapNgramsDocs <- mapNodeIdNgrams
                <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  -- Persist the ngrams themselves, obtaining their DB ids.
  terms2id <- insertNgrams $ Map.keys mapNgramsDocs
  -- to be removed
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs

  -- new
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ Map.toList mapNgramsDocs
  -- insertDocNgrams
  -- Flatten the nested map into (doc, ngram-type, weight) rows; rows whose
  -- ngrams id is missing from mapCgramsId are silently dropped (catMaybes).
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                         <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w) <- Map.toList mapNodeIdWeight
                       ]

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams
  pure ids'
281
282 ------------------------------------------------------------------------
283 -- TODO Type NodeDocumentUnicised
-- | Insert documents (deduplicated via their uniq-id hash) under a
--   corpus, returning the new node ids paired with the documents that
--   were actually inserted.
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([DocId], [DocumentWithId a])
insertDocs uId cId hs = do
  -- Stamp each document with its deduplication hash before insertion.
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    -- Keep only documents the DB reports as actually inserted.
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)
301
302
303
304 ------------------------------------------------------------------------
-- | Pair a document with its uniq-id hash; panics if the hash is
--   missing (documents are expected to have passed through 'addUniqId').
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d =
  case view uniqId d of
    Just h  -> (h, d)
    Nothing -> panic "[ERROR] Database.Flow.toInsert"
311
312
-- | Index the rows the DB reports as actually inserted by their
--   deduplication hash, dropping rows that were already present.
toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted = Map.fromList
           . map (\r -> (reUniqId r, r) )
           . filter reInserted
318
-- | Join inserted-row info with the original documents by hash,
--   silently dropping documents whose hash has no inserted row.
mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (hash, hpd) =
      case Map.lookup hash rs of
        Nothing       -> Nothing
        Just returned -> Just (DocumentWithId (reId returned) hpd)
327
328 ------------------------------------------------------------------------
-- | Stub instance: 'hasText' is left 'undefined' for contacts.
--   NOTE(review): partial definition — calling 'hasText' on a
--   'HyperdataContact' will crash at runtime; presumably contacts never
--   reach the text-extraction path. TODO confirm and give a total definition.
instance HasText HyperdataContact
  where
    hasText = undefined
332 ------------------------------------------------------------------------
333 ------------------------------------------------------------------------
-- | Run an ngrams-extraction action over each document, pairing every
--   document with its extracted ngrams map.
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams extract = mapM annotate
  where
    -- Attach the extraction result to the document it came from.
    annotate doc = DocumentIdWithNgrams doc <$> extract (documentData doc)
344
345 ------------------------------------------------------------------------
-- | For contacts, the only extracted ngrams are the author's last name
--   (tagged 'Authors'); the literal string "Nothing" stands in when the
--   last name is absent.
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extractAuthors l hc
      where
        extractAuthors :: TermType Lang -> HyperdataContact
                       -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractAuthors _lang contact =
          pure $ Map.fromList (map withAuthorType authorNgrams)
          where
            withAuthorType ng = (ng, Map.singleton Authors 1)
            authorNgrams =
              map text2ngrams $
                case view (hc_who . _Just . cw_lastName) contact of
                  Nothing       -> ["Nothing"]
                  Just lastName -> [lastName]
358
-- | A document's searchable text is its title plus abstract, skipping
--   whichever of the two is absent.
instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hd_title h
                          , _hd_abstract h
                          ]
364
365
-- | Extract ngrams from a document: its source, institutes, authors and
--   the terms found in its text, each tagged with the matching
--   'NgramsType' and a count of 1.  Missing fields yield the literal
--   string "Nothing".
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          -- Term extraction runs in IO over the document's title+abstract.
          extracted <- liftBase (extractTerms lang' (hasText doc))
          let
            source =
              text2ngrams (maybe "Nothing" identity (_hd_source doc))
            institutes =
              map text2ngrams
                  (maybe ["Nothing"]
                         (map toSchoolName . splitOn ", ")
                         (_hd_institutes doc))
            authors =
              map text2ngrams
                  (maybe ["Nothing"] (splitOn ", ") (_hd_authors doc))
            terms' =
              map (text2ngrams . intercalate " " . _terms_label)
                  (concat extracted)
          -- Keep this concatenation order: Map.fromList keeps the LAST
          -- binding for a duplicated key, so later categories win.
          pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
                             <> [(i', Map.singleton Institutes  1) | i' <- institutes ]
                             <> [(a', Map.singleton Authors     1) | a' <- authors ]
                             <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
398
-- | Delegate ngrams extraction to the node's hyperdata payload
--   (the last field of the 'Node' constructor).
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
402
-- | Delegate text extraction to the node's hyperdata payload
--   (the last field of the 'Node' constructor).
instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h
406
407