]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[DB] clean and instance insertDB
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30
31 , flowCorpusFile
32 , flowCorpus
33 , flowAnnuaire
34
35 , getOrMkRoot
36 , getOrMk_RootWithCorpus
37 , TermType(..)
38 , DataOrigin(..)
39 , allDataOrigins
40
41 , do_api
42 )
43 where
44
45 import Control.Lens ((^.), view, _Just, makeLenses)
46 import Data.Aeson.TH (deriveJSON)
47 import Data.Either
48 import Data.List (concat)
49 import qualified Data.Map as Map
50 import Data.Map (Map, lookup)
51 import Data.Maybe (Maybe(..), catMaybes)
52 import Data.Monoid
53 import Data.Swagger
54 import Data.Text (splitOn, intercalate)
55 import Data.Traversable (traverse)
56 import Data.Tuple.Extra (first, second)
57 import GHC.Generics (Generic)
58 import System.FilePath (FilePath)
59
60 import Gargantext.Core (Lang(..))
61 import Gargantext.Core.Flow.Types
62 import Gargantext.Core.Types (Terms(..))
63 import Gargantext.Core.Types.Individu (User(..))
64 import Gargantext.Core.Types.Main
65 import Gargantext.Database.Action.Flow.List
66 import Gargantext.Database.Action.Flow.Types
67 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
68 import Gargantext.Database.Query.Table.Node
69 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
70 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
71 import Gargantext.Database.Action.Search (searchInDatabase)
72 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
73 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
74 import Gargantext.Database.Admin.Types.Hyperdata
75 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
76 import Gargantext.Database.Prelude
77 import Gargantext.Database.Query.Table.Ngrams
78 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
79 import Gargantext.Database.Query.Table.NodeNodeNgrams2
80 import Gargantext.Ext.IMT (toSchoolName)
81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
82 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
83 import Gargantext.Text
84 import Gargantext.Prelude
85 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
86 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
87 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
88 import Gargantext.Text.Terms
89 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
90 import qualified Gargantext.Text.Corpus.API as API
91
92 ------------------------------------------------------------------------
93 -- TODO use internal with API name (could be old data)
94 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
95 | ExternalOrigin { _do_api :: API.ExternalAPIs }
96 -- TODO Web
97 deriving (Generic, Eq)
98
99 makeLenses ''DataOrigin
100 deriveJSON (unPrefix "_do_") ''DataOrigin
101 instance ToSchema DataOrigin where
102 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
103
104 allDataOrigins :: [DataOrigin]
105 allDataOrigins = map InternalOrigin API.externalAPIs
106 <> map ExternalOrigin API.externalAPIs
107
108 ---------------
109
110 data DataText = DataOld ![NodeId]
111 | DataNew ![[HyperdataDocument]]
112
113
114 -- TODO use the split parameter in config file
115 getDataText :: FlowCmdM env err m
116 => DataOrigin
117 -> TermType Lang
118 -> API.Query
119 -> Maybe API.Limit
120 -> m DataText
121 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
122 <$> splitEvery 500
123 <$> API.get api (_tt_lang la) q li
124 getDataText (InternalOrigin _) _la q _li = do
125 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
126 (UserName userMaster)
127 (Left "")
128 (Nothing :: Maybe HyperdataCorpus)
129 ids <- map fst <$> searchInDatabase cId (stemIt q)
130 pure $ DataOld ids
131
132 -------------------------------------------------------------------------------
133 flowDataText :: FlowCmdM env err m
134 => User
135 -> DataText
136 -> TermType Lang
137 -> CorpusId
138 -> m CorpusId
139 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
140 where
141 corpusType = (Nothing :: Maybe HyperdataCorpus)
142 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
143
144 ------------------------------------------------------------------------
145 -- TODO use proxy
146 flowAnnuaire :: FlowCmdM env err m
147 => User
148 -> Either CorpusName [CorpusId]
149 -> (TermType Lang)
150 -> FilePath
151 -> m AnnuaireId
152 flowAnnuaire u n l filePath = do
153 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
154 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
155
156 ------------------------------------------------------------------------
157 flowCorpusFile :: FlowCmdM env err m
158 => User
159 -> Either CorpusName [CorpusId]
160 -> Limit -- Limit the number of docs (for dev purpose)
161 -> TermType Lang -> FileFormat -> FilePath
162 -> m CorpusId
163 flowCorpusFile u n l la ff fp = do
164 docs <- liftBase ( splitEvery 500
165 <$> take l
166 <$> parseFile ff fp
167 )
168 flowCorpus u n la (map (map toHyperdataDocument) docs)
169
170 ------------------------------------------------------------------------
171 -- | TODO improve the needed type to create/update a corpus
172 -- (For now, Either is enough)
173 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
174 => User
175 -> Either CorpusName [CorpusId]
176 -> TermType Lang
177 -> [[a]]
178 -> m CorpusId
179 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
180
181
182 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
183 => Maybe c
184 -> User
185 -> Either CorpusName [CorpusId]
186 -> TermType Lang
187 -> [[a]]
188 -> m CorpusId
189 flow c u cn la docs = do
190 ids <- traverse (insertMasterDocs c la) docs
191 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
192
193 ------------------------------------------------------------------------
194 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
195 => Lang
196 -> User
197 -> Either CorpusName [CorpusId]
198 -> Maybe c
199 -> [NodeId]
200 -> m CorpusId
201 flowCorpusUser l user corpusName ctype ids = do
202 -- User Flow
203 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
204 listId <- getOrMkList userCorpusId userId
205 _cooc <- insertDefaultNode NodeListCooc listId userId
206 -- TODO: check if present already, ignore
207 _ <- Doc.add userCorpusId ids
208
209 _tId <- insertDefaultNode NodeTexts userCorpusId userId
210 -- printDebug "Node Text Id" tId
211
212 -- User List Flow
213 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
214 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
215 _userListId <- flowList_DbRepo listId ngs
216 _mastListId <- getOrMkList masterCorpusId masterUserId
217 -- _ <- insertOccsUpdates userCorpusId mastListId
218 -- printDebug "userListId" userListId
219 -- User Graph Flow
220 _ <- insertDefaultNode NodeDashboard userCorpusId userId
221 _ <- insertDefaultNode NodeGraph userCorpusId userId
222 --_ <- mkPhylo userCorpusId userId
223
224 -- Annuaire Flow
225 -- _ <- mkAnnuaire rootUserId userId
226 pure userCorpusId
227
228
229 insertMasterDocs :: ( FlowCmdM env err m
230 , FlowCorpus a
231 , MkCorpus c
232 )
233 => Maybe c
234 -> TermType Lang
235 -> [a]
236 -> m [DocId]
237 insertMasterDocs c lang hs = do
238 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
239
240 -- TODO Type NodeDocumentUnicised
241 let docs = map addUniqId hs
242 ids <- insertDb masterUserId masterCorpusId docs
243 let
244 ids' = map reId ids
245 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
246 -- TODO
247 -- create a corpus with database name (CSV or PubMed)
248 -- add documents to the corpus (create node_node link)
249 -- this will enable global database monitoring
250
251 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
252 maps <- mapNodeIdNgrams
253 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
254
255 terms2id <- insertNgrams $ Map.keys maps
256 -- to be removed
257 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
258
259 -- new
260 lId <- getOrMkList masterCorpusId masterUserId
261 mapCgramsId <- listInsertDb lId toNodeNgramsW'
262 $ map (first _ngramsTerms . second Map.keys)
263 $ Map.toList maps
264 -- insertDocNgrams
265 _return <- insertNodeNodeNgrams2
266 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
267 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
268 <*> Just (fromIntegral w :: Double)
269 | (terms'', mapNgramsTypes) <- Map.toList maps
270 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
271 , (nId, w) <- Map.toList mapNodeIdWeight
272 ]
273
274 _ <- Doc.add masterCorpusId ids'
275 _cooc <- insertDefaultNode NodeListCooc lId masterUserId
276 -- to be removed
277 _ <- insertDocNgrams lId indexedNgrams
278
279 pure ids'
280
281
282 ------------------------------------------------------------------------
283
284
285
286 ------------------------------------------------------------------------
287 viewUniqId' :: UniqId a
288 => a
289 -> (HashId, a)
290 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
291 where
292 err = panic "[ERROR] Database.Flow.toInsert"
293
294
295 toInserted :: [ReturnId]
296 -> Map HashId ReturnId
297 toInserted =
298 Map.fromList . map (\r -> (reUniqId r, r) )
299 . filter (\r -> reInserted r == True)
300
301 mergeData :: Map HashId ReturnId
302 -> Map HashId a
303 -> [DocumentWithId a]
304 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
305 where
306 toDocumentWithId (sha,hpd) =
307 DocumentWithId <$> fmap reId (lookup sha rs)
308 <*> Just hpd
309
310 ------------------------------------------------------------------------
311
312 instance HasText HyperdataContact
313 where
314 hasText = undefined
315
316 ------------------------------------------------------------------------
317 ------------------------------------------------------------------------
318
319 documentIdWithNgrams :: HasNodeError err
320 => (a
321 -> Cmd err (Map Ngrams (Map NgramsType Int)))
322 -> [DocumentWithId a]
323 -> Cmd err [DocumentIdWithNgrams a]
324 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
325 where
326 toDocumentIdWithNgrams d = do
327 e <- f $ documentData d
328 pure $ DocumentIdWithNgrams d e
329
330
331 ------------------------------------------------------------------------
332
333
334 instance ExtractNgramsT HyperdataContact
335 where
336 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
337 where
338 extract :: TermType Lang -> HyperdataContact
339 -> Cmd err (Map Ngrams (Map NgramsType Int))
340 extract _l hc' = do
341 let authors = map text2ngrams
342 $ maybe ["Nothing"] (\a -> [a])
343 $ view (hc_who . _Just . cw_lastName) hc'
344
345 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
346
347 instance HasText HyperdataDocument
348 where
349 hasText h = catMaybes [ _hd_title h
350 , _hd_abstract h
351 ]
352
353 instance ExtractNgramsT HyperdataDocument
354 where
355 extractNgramsT :: TermType Lang
356 -> HyperdataDocument
357 -> Cmd err (Map Ngrams (Map NgramsType Int))
358 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
359 where
360 extractNgramsT' :: TermType Lang
361 -> HyperdataDocument
362 -> Cmd err (Map Ngrams (Map NgramsType Int))
363 extractNgramsT' lang' doc = do
364 let source = text2ngrams
365 $ maybe "Nothing" identity
366 $ _hd_source doc
367
368 institutes = map text2ngrams
369 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
370 $ _hd_institutes doc
371
372 authors = map text2ngrams
373 $ maybe ["Nothing"] (splitOn ", ")
374 $ _hd_authors doc
375
376 terms' <- map text2ngrams
377 <$> map (intercalate " " . _terms_label)
378 <$> concat
379 <$> liftBase (extractTerms lang' $ hasText doc)
380
381 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
382 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
383 <> [(a', Map.singleton Authors 1) | a' <- authors ]
384 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
385
386