]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[REFACT][FLOW] insertDocs fun
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35
36 , getOrMkRoot
37 , getOrMk_RootWithCorpus
38 , TermType(..)
39 , DataOrigin(..)
40 , allDataOrigins
41
42 , do_api
43 )
44 where
45
46 import Control.Lens ((^.), view, _Just, makeLenses)
47 import Data.Aeson.TH (deriveJSON)
48 import Data.Either
49 import Data.List (concat)
50 import qualified Data.Map as Map
51 import Data.Map (Map, lookup)
52 import Data.Maybe (Maybe(..), catMaybes)
53 import Data.Monoid
54 import Data.Swagger
55 import Data.Text (splitOn, intercalate)
56 import Data.Traversable (traverse)
57 import Data.Tuple.Extra (first, second)
58 import GHC.Generics (Generic)
59 import System.FilePath (FilePath)
60
61 import Gargantext.Core (Lang(..))
62 import Gargantext.Core.Flow.Types
63 import Gargantext.Core.Types (Terms(..))
64 import Gargantext.Core.Types.Individu (User(..))
65 import Gargantext.Core.Types.Main
66 import Gargantext.Database.Action.Flow.List
67 import Gargantext.Database.Action.Flow.Types
68 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
69 import Gargantext.Database.Query.Table.Node
70 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
71 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
72 import Gargantext.Database.Action.Search (searchDocInDatabase)
73 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
74 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
75 import Gargantext.Database.Admin.Types.Hyperdata
76 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
77 import Gargantext.Database.Prelude
78 import Gargantext.Database.Query.Table.Ngrams
79 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
80 import Gargantext.Database.Query.Table.NodeNodeNgrams2
81 import Gargantext.Core.Ext.IMT (toSchoolName)
82 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
83 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
84 import Gargantext.Core.Text
85 import Gargantext.Prelude
86 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
87 import Gargantext.Core.Text.List (buildNgramsLists,StopSize(..))
88 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
89 import Gargantext.Core.Text.Terms
90 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
91 import qualified Gargantext.Core.Text.Corpus.API as API
92
93 ------------------------------------------------------------------------
94 -- TODO use internal with API name (could be old data)
95 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
96 | ExternalOrigin { _do_api :: API.ExternalAPIs }
97 -- TODO Web
98 deriving (Generic, Eq)
99
100 makeLenses ''DataOrigin
101 deriveJSON (unPrefix "_do_") ''DataOrigin
102 instance ToSchema DataOrigin where
103 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
104
105 allDataOrigins :: [DataOrigin]
106 allDataOrigins = map InternalOrigin API.externalAPIs
107 <> map ExternalOrigin API.externalAPIs
108
109 ---------------
110 data DataText = DataOld ![NodeId]
111 | DataNew ![[HyperdataDocument]]
112
113 -- TODO use the split parameter in config file
114 getDataText :: FlowCmdM env err m
115 => DataOrigin
116 -> TermType Lang
117 -> API.Query
118 -> Maybe API.Limit
119 -> m DataText
120 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
121 <$> splitEvery 500
122 <$> API.get api (_tt_lang la) q li
123 getDataText (InternalOrigin _) _la q _li = do
124 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
125 (UserName userMaster)
126 (Left "")
127 (Nothing :: Maybe HyperdataCorpus)
128 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
129 pure $ DataOld ids
130
131 -------------------------------------------------------------------------------
132 flowDataText :: FlowCmdM env err m
133 => User
134 -> DataText
135 -> TermType Lang
136 -> CorpusId
137 -> m CorpusId
138 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
139 where
140 corpusType = (Nothing :: Maybe HyperdataCorpus)
141 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
142
143 ------------------------------------------------------------------------
144 -- TODO use proxy
145 flowAnnuaire :: FlowCmdM env err m
146 => User
147 -> Either CorpusName [CorpusId]
148 -> (TermType Lang)
149 -> FilePath
150 -> m AnnuaireId
151 flowAnnuaire u n l filePath = do
152 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
153 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
154
155 ------------------------------------------------------------------------
156 flowCorpusFile :: FlowCmdM env err m
157 => User
158 -> Either CorpusName [CorpusId]
159 -> Limit -- Limit the number of docs (for dev purpose)
160 -> TermType Lang -> FileFormat -> FilePath
161 -> m CorpusId
162 flowCorpusFile u n l la ff fp = do
163 docs <- liftBase ( splitEvery 500
164 <$> take l
165 <$> parseFile ff fp
166 )
167 flowCorpus u n la (map (map toHyperdataDocument) docs)
168
169 ------------------------------------------------------------------------
170 -- | TODO improve the needed type to create/update a corpus
171 -- (For now, Either is enough)
172 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
173 => User
174 -> Either CorpusName [CorpusId]
175 -> TermType Lang
176 -> [[a]]
177 -> m CorpusId
178 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
179
180
181 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
182 => Maybe c
183 -> User
184 -> Either CorpusName [CorpusId]
185 -> TermType Lang
186 -> [[a]]
187 -> m CorpusId
188 flow c u cn la docs = do
189 -- TODO if public insertMasterDocs else insertUserDocs
190 ids <- traverse (insertMasterDocs c la) docs
191 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
192
193 ------------------------------------------------------------------------
194 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
195 => Lang
196 -> User
197 -> Either CorpusName [CorpusId]
198 -> Maybe c
199 -> [NodeId]
200 -> m CorpusId
201 flowCorpusUser l user corpusName ctype ids = do
202 -- User Flow
203 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
204 listId <- getOrMkList userCorpusId userId
205 _cooc <- insertDefaultNode NodeListCooc listId userId
206 -- TODO: check if present already, ignore
207 _ <- Doc.add userCorpusId ids
208
209 _tId <- insertDefaultNode NodeTexts userCorpusId userId
210 -- printDebug "Node Text Id" tId
211
212 -- User List Flow
213 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
214 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
215 _userListId <- flowList_DbRepo listId ngs
216 _mastListId <- getOrMkList masterCorpusId masterUserId
217 -- _ <- insertOccsUpdates userCorpusId mastListId
218 -- printDebug "userListId" userListId
219 -- User Graph Flow
220 _ <- insertDefaultNode NodeDashboard userCorpusId userId
221 _ <- insertDefaultNode NodeGraph userCorpusId userId
222 --_ <- mkPhylo userCorpusId userId
223
224 -- Annuaire Flow
225 -- _ <- mkAnnuaire rootUserId userId
226 pure userCorpusId
227
228 -- TODO Type NodeDocumentUnicised
229 insertDocs :: ( FlowCmdM env err m
230 , FlowCorpus a
231 )
232 => [a]
233 -> UserId
234 -> CorpusId
235 -> m ([DocId], [DocumentWithId a])
236 insertDocs hs uId cId = do
237 let docs = map addUniqId hs
238 ids <- insertDb uId cId docs
239 let
240 ids' = map reId ids
241 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
242 _ <- Doc.add cId ids'
243 pure (ids', documentsWithId)
244
245
246 insertMasterDocs :: ( FlowCmdM env err m
247 , FlowCorpus a
248 , MkCorpus c
249 )
250 => Maybe c
251 -> TermType Lang
252 -> [a]
253 -> m [DocId]
254 insertMasterDocs c lang hs = do
255 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
256 (ids', documentsWithId) <- insertDocs hs masterUserId masterCorpusId
257
258 -- TODO
259 -- create a corpus with database name (CSV or PubMed)
260 -- add documents to the corpus (create node_node link)
261 -- this will enable global database monitoring
262
263 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
264 mapNgramsDocs <- mapNodeIdNgrams
265 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
266
267 terms2id <- insertNgrams $ Map.keys mapNgramsDocs
268 -- to be removed
269 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
270
271 -- new
272 lId <- getOrMkList masterCorpusId masterUserId
273 mapCgramsId <- listInsertDb lId toNodeNgramsW'
274 $ map (first _ngramsTerms . second Map.keys)
275 $ Map.toList mapNgramsDocs
276 -- insertDocNgrams
277 _return <- insertNodeNodeNgrams2
278 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
279 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
280 <*> Just (fromIntegral w :: Double)
281 | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
282 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
283 , (nId, w) <- Map.toList mapNodeIdWeight
284 ]
285
286 _cooc <- insertDefaultNode NodeListCooc lId masterUserId
287 -- to be removed
288 _ <- insertDocNgrams lId indexedNgrams
289 pure ids'
290
291 ------------------------------------------------------------------------
292 ------------------------------------------------------------------------
293 viewUniqId' :: UniqId a
294 => a
295 -> (HashId, a)
296 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
297 where
298 err = panic "[ERROR] Database.Flow.toInsert"
299
300
301 toInserted :: [ReturnId]
302 -> Map HashId ReturnId
303 toInserted =
304 Map.fromList . map (\r -> (reUniqId r, r) )
305 . filter (\r -> reInserted r == True)
306
307 mergeData :: Map HashId ReturnId
308 -> Map HashId a
309 -> [DocumentWithId a]
310 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
311 where
312 toDocumentWithId (sha,hpd) =
313 DocumentWithId <$> fmap reId (lookup sha rs)
314 <*> Just hpd
315
316 ------------------------------------------------------------------------
317 instance HasText HyperdataContact
318 where
319 hasText = undefined
320 ------------------------------------------------------------------------
321 ------------------------------------------------------------------------
322 documentIdWithNgrams :: HasNodeError err
323 => (a
324 -> Cmd err (Map Ngrams (Map NgramsType Int)))
325 -> [DocumentWithId a]
326 -> Cmd err [DocumentIdWithNgrams a]
327 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
328 where
329 toDocumentIdWithNgrams d = do
330 e <- f $ documentData d
331 pure $ DocumentIdWithNgrams d e
332
333 ------------------------------------------------------------------------
334 instance ExtractNgramsT HyperdataContact
335 where
336 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
337 where
338 extract :: TermType Lang -> HyperdataContact
339 -> Cmd err (Map Ngrams (Map NgramsType Int))
340 extract _l hc' = do
341 let authors = map text2ngrams
342 $ maybe ["Nothing"] (\a -> [a])
343 $ view (hc_who . _Just . cw_lastName) hc'
344
345 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
346
347 instance HasText HyperdataDocument
348 where
349 hasText h = catMaybes [ _hd_title h
350 , _hd_abstract h
351 ]
352
353 instance ExtractNgramsT HyperdataDocument
354 where
355 extractNgramsT :: TermType Lang
356 -> HyperdataDocument
357 -> Cmd err (Map Ngrams (Map NgramsType Int))
358 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
359 where
360 extractNgramsT' :: TermType Lang
361 -> HyperdataDocument
362 -> Cmd err (Map Ngrams (Map NgramsType Int))
363 extractNgramsT' lang' doc = do
364 let source = text2ngrams
365 $ maybe "Nothing" identity
366 $ _hd_source doc
367
368 institutes = map text2ngrams
369 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
370 $ _hd_institutes doc
371
372 authors = map text2ngrams
373 $ maybe ["Nothing"] (splitOn ", ")
374 $ _hd_authors doc
375
376 terms' <- map text2ngrams
377 <$> map (intercalate " " . _terms_label)
378 <$> concat
379 <$> liftBase (extractTerms lang' $ hasText doc)
380
381 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
382 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
383 <> [(a', Map.singleton Authors 1) | a' <- authors ]
384 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
385
386