]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[CLEAN] refact clean WIP
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30
31 , flowCorpusFile
32 , flowCorpus
33 , flowAnnuaire
34
35 , getOrMkRoot
36 , getOrMk_RootWithCorpus
37 , TermType(..)
38 , DataOrigin(..)
39 , allDataOrigins
40
41 , do_api
42 )
43 where
44
45 import Control.Lens ((^.), view, _Just, makeLenses)
46 import Data.Aeson.TH (deriveJSON)
47 import Data.Either
48 import Data.List (concat)
49 import qualified Data.Map as Map
50 import Data.Map (Map, lookup)
51 import Data.Maybe (Maybe(..), catMaybes)
52 import Data.Monoid
53 import Data.Swagger
54 import Data.Text (splitOn, intercalate)
55 import Data.Traversable (traverse)
56 import Data.Tuple.Extra (first, second)
57 import GHC.Generics (Generic)
58 import System.FilePath (FilePath)
59
60 import Gargantext.Core (Lang(..))
61 import Gargantext.Core.Flow.Types
62 import Gargantext.Core.Types (Terms(..))
63 import Gargantext.Core.Types.Individu (User(..))
64 import Gargantext.Core.Types.Main
65 import Gargantext.Database.Action.Flow.List
66 import Gargantext.Database.Action.Flow.Types
67 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
68 import Gargantext.Database.Query.Table.Node
69 import Gargantext.Database.Query.Table.Node.Contact -- (HyperdataContact(..), ContactWho(..))
70 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
71 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
72 import Gargantext.Database.Action.Search (searchInDatabase)
73 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
74 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
75 import Gargantext.Database.Admin.Types.Hyperdata
76 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
77 import Gargantext.Database.Prelude
78 import Gargantext.Database.Query.Table.Ngrams
79 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
80 import Gargantext.Database.Query.Table.NodeNodeNgrams2
81 import Gargantext.Ext.IMT (toSchoolName)
82 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
83 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
84 import Gargantext.Text
85 import Gargantext.Prelude
86 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
87 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
88 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
89 import Gargantext.Text.Terms
90 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
91 import qualified Gargantext.Text.Corpus.API as API
92
93 ------------------------------------------------------------------------
94 -- TODO use internal with API name (could be old data)
95 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
96 | ExternalOrigin { _do_api :: API.ExternalAPIs }
97 -- TODO Web
98 deriving (Generic, Eq)
99
100 makeLenses ''DataOrigin
101 deriveJSON (unPrefix "_do_") ''DataOrigin
102 instance ToSchema DataOrigin where
103 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
104
105 allDataOrigins :: [DataOrigin]
106 allDataOrigins = map InternalOrigin API.externalAPIs
107 <> map ExternalOrigin API.externalAPIs
108
109 ---------------
110
111 data DataText = DataOld ![NodeId]
112 | DataNew ![[HyperdataDocument]]
113
114
115 -- TODO use the split parameter in config file
116 getDataText :: FlowCmdM env err m
117 => DataOrigin
118 -> TermType Lang
119 -> API.Query
120 -> Maybe API.Limit
121 -> m DataText
122 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
123 <$> splitEvery 500
124 <$> API.get api (_tt_lang la) q li
125 getDataText (InternalOrigin _) _la q _li = do
126 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
127 (UserName userMaster)
128 (Left "")
129 (Nothing :: Maybe HyperdataCorpus)
130 ids <- map fst <$> searchInDatabase cId (stemIt q)
131 pure $ DataOld ids
132
133 -------------------------------------------------------------------------------
134 flowDataText :: FlowCmdM env err m
135 => User
136 -> DataText
137 -> TermType Lang
138 -> CorpusId
139 -> m CorpusId
140 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
141 where
142 corpusType = (Nothing :: Maybe HyperdataCorpus)
143 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
144
145 ------------------------------------------------------------------------
146 -- TODO use proxy
147 flowAnnuaire :: FlowCmdM env err m
148 => User
149 -> Either CorpusName [CorpusId]
150 -> (TermType Lang)
151 -> FilePath
152 -> m AnnuaireId
153 flowAnnuaire u n l filePath = do
154 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
155 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
156
157 ------------------------------------------------------------------------
158 flowCorpusFile :: FlowCmdM env err m
159 => User
160 -> Either CorpusName [CorpusId]
161 -> Limit -- Limit the number of docs (for dev purpose)
162 -> TermType Lang -> FileFormat -> FilePath
163 -> m CorpusId
164 flowCorpusFile u n l la ff fp = do
165 docs <- liftBase ( splitEvery 500
166 <$> take l
167 <$> parseFile ff fp
168 )
169 flowCorpus u n la (map (map toHyperdataDocument) docs)
170
171 ------------------------------------------------------------------------
172 -- | TODO improve the needed type to create/update a corpus
173 -- (For now, Either is enough)
174 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
175 => User
176 -> Either CorpusName [CorpusId]
177 -> TermType Lang
178 -> [[a]]
179 -> m CorpusId
180 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
181
182
183 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
184 => Maybe c
185 -> User
186 -> Either CorpusName [CorpusId]
187 -> TermType Lang
188 -> [[a]]
189 -> m CorpusId
190 flow c u cn la docs = do
191 ids <- traverse (insertMasterDocs c la) docs
192 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
193
194 ------------------------------------------------------------------------
195 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
196 => Lang
197 -> User
198 -> Either CorpusName [CorpusId]
199 -> Maybe c
200 -> [NodeId]
201 -> m CorpusId
202 flowCorpusUser l user corpusName ctype ids = do
203 -- User Flow
204 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
205 listId <- getOrMkList userCorpusId userId
206 _cooc <- mkNode NodeListCooc listId userId
207 -- TODO: check if present already, ignore
208 _ <- Doc.add userCorpusId ids
209
210 _tId <- mkNode NodeTexts userCorpusId userId
211 -- printDebug "Node Text Id" tId
212
213 -- User List Flow
214 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
215 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
216 _userListId <- flowList_DbRepo listId ngs
217 _mastListId <- getOrMkList masterCorpusId masterUserId
218 -- _ <- insertOccsUpdates userCorpusId mastListId
219 -- printDebug "userListId" userListId
220 -- User Graph Flow
221 _ <- mkDashboard userCorpusId userId
222 _ <- mkGraph userCorpusId userId
223 --_ <- mkPhylo userCorpusId userId
224
225 -- Annuaire Flow
226 -- _ <- mkAnnuaire rootUserId userId
227 pure userCorpusId
228
229
230 insertMasterDocs :: ( FlowCmdM env err m
231 , FlowCorpus a
232 , MkCorpus c
233 )
234 => Maybe c
235 -> TermType Lang
236 -> [a]
237 -> m [DocId]
238 insertMasterDocs c lang hs = do
239 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
240
241 -- TODO Type NodeDocumentUnicised
242 let docs = map addUniqId hs
243 ids <- insertDb masterUserId masterCorpusId docs
244 let
245 ids' = map reId ids
246 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
247 -- TODO
248 -- create a corpus with database name (CSV or PubMed)
249 -- add documents to the corpus (create node_node link)
250 -- this will enable global database monitoring
251
252 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
253 maps <- mapNodeIdNgrams
254 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
255
256 terms2id <- insertNgrams $ Map.keys maps
257 -- to be removed
258 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
259
260 -- new
261 lId <- getOrMkList masterCorpusId masterUserId
262 mapCgramsId <- listInsertDb lId toNodeNgramsW'
263 $ map (first _ngramsTerms . second Map.keys)
264 $ Map.toList maps
265 -- insertDocNgrams
266 _return <- insertNodeNodeNgrams2
267 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
268 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
269 <*> Just (fromIntegral w :: Double)
270 | (terms'', mapNgramsTypes) <- Map.toList maps
271 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
272 , (nId, w) <- Map.toList mapNodeIdWeight
273 ]
274
275 _ <- Doc.add masterCorpusId ids'
276 _cooc <- mkNode NodeListCooc lId masterUserId
277 -- to be removed
278 _ <- insertDocNgrams lId indexedNgrams
279
280 pure ids'
281
282
283 ------------------------------------------------------------------------
284
285
286
287 ------------------------------------------------------------------------
288 viewUniqId' :: UniqId a
289 => a
290 -> (HashId, a)
291 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
292 where
293 err = panic "[ERROR] Database.Flow.toInsert"
294
295
296 toInserted :: [ReturnId]
297 -> Map HashId ReturnId
298 toInserted =
299 Map.fromList . map (\r -> (reUniqId r, r) )
300 . filter (\r -> reInserted r == True)
301
302 mergeData :: Map HashId ReturnId
303 -> Map HashId a
304 -> [DocumentWithId a]
305 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
306 where
307 toDocumentWithId (sha,hpd) =
308 DocumentWithId <$> fmap reId (lookup sha rs)
309 <*> Just hpd
310
311 ------------------------------------------------------------------------
312
313 instance HasText HyperdataContact
314 where
315 hasText = undefined
316
317 ------------------------------------------------------------------------
318 ------------------------------------------------------------------------
319
320 documentIdWithNgrams :: HasNodeError err
321 => (a
322 -> Cmd err (Map Ngrams (Map NgramsType Int)))
323 -> [DocumentWithId a]
324 -> Cmd err [DocumentIdWithNgrams a]
325 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
326 where
327 toDocumentIdWithNgrams d = do
328 e <- f $ documentData d
329 pure $ DocumentIdWithNgrams d e
330
331
332 ------------------------------------------------------------------------
333
334
335 instance ExtractNgramsT HyperdataContact
336 where
337 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
338 where
339 extract :: TermType Lang -> HyperdataContact
340 -> Cmd err (Map Ngrams (Map NgramsType Int))
341 extract _l hc' = do
342 let authors = map text2ngrams
343 $ maybe ["Nothing"] (\a -> [a])
344 $ view (hc_who . _Just . cw_lastName) hc'
345
346 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
347
348 instance HasText HyperdataDocument
349 where
350 hasText h = catMaybes [ _hyperdataDocument_title h
351 , _hyperdataDocument_abstract h
352 ]
353
354 instance ExtractNgramsT HyperdataDocument
355 where
356 extractNgramsT :: TermType Lang
357 -> HyperdataDocument
358 -> Cmd err (Map Ngrams (Map NgramsType Int))
359 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
360 where
361 extractNgramsT' :: TermType Lang
362 -> HyperdataDocument
363 -> Cmd err (Map Ngrams (Map NgramsType Int))
364 extractNgramsT' lang' doc = do
365 let source = text2ngrams
366 $ maybe "Nothing" identity
367 $ _hyperdataDocument_source doc
368
369 institutes = map text2ngrams
370 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
371 $ _hyperdataDocument_institutes doc
372
373 authors = map text2ngrams
374 $ maybe ["Nothing"] (splitOn ", ")
375 $ _hyperdataDocument_authors doc
376
377 terms' <- map text2ngrams
378 <$> map (intercalate " " . _terms_label)
379 <$> concat
380 <$> liftBase (extractTerms lang' $ hasText doc)
381
382 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
383 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
384 <> [(a', Map.singleton Authors 1) | a' <- authors ]
385 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
386
387