]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[DB/FACT] Schema NodeNodeNgrams -> Query (with warnings)
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29 {-# LANGUAGE TemplateHaskell #-}
30
31 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
32 ( FlowCmdM
33 , getDataText
34 , flowDataText
35
36 , flowCorpusFile
37 , flowCorpus
38 , flowAnnuaire
39
40 , getOrMkRoot
41 , getOrMk_RootWithCorpus
42 , TermType(..)
43 , DataOrigin(..)
44 , allDataOrigins
45
46 , do_api
47 )
48 where
49
50 import Control.Lens ((^.), view, _Just, makeLenses)
51 import Data.Aeson.TH (deriveJSON)
52 import Data.Either
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (Maybe(..), catMaybes)
56 import Data.Monoid
57 import Data.Swagger
58 import Data.Text (splitOn, intercalate)
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import Gargantext.Core (Lang(..))
62 import Gargantext.Core.Flow.Types
63 import Gargantext.Core.Types (Terms(..))
64 import Gargantext.Core.Types.Individu (User(..))
65 import Gargantext.Core.Types.Main
66 import Gargantext.Database.Action.Flow.List
67 import Gargantext.Database.Action.Flow.Types
68 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
69 import Gargantext.Database.Query.Table.Node
70 import Gargantext.Database.Query.Table.Node.Contact -- (HyperdataContact(..), ContactWho(..))
71 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
72 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
73 import Gargantext.Database.Action.Search (searchInDatabase)
74 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
75 import Gargantext.Database.Admin.Types.Errors (HasNodeError(..))
76 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
77 import Gargantext.Database.Admin.Utils (Cmd)
78 import Gargantext.Database.Query.Table.Ngrams
79 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
80 import Gargantext.Database.Query.Table.NodeNodeNgrams2
81 import Gargantext.Ext.IMT (toSchoolName)
82 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
83 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
84 import Gargantext.Text
85 import Gargantext.Prelude
86 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
87 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
88 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
89 import Gargantext.Text.Terms
90 import GHC.Generics (Generic)
91 import System.FilePath (FilePath)
92 import qualified Data.Map as Map
93 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
94 import qualified Gargantext.Text.Corpus.API as API
95
96 ------------------------------------------------------------------------
97 -- TODO use internal with API name (could be old data)
98 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
99 | ExternalOrigin { _do_api :: API.ExternalAPIs }
100 -- TODO Web
101 deriving (Generic, Eq)
102
103 makeLenses ''DataOrigin
104 deriveJSON (unPrefix "_do_") ''DataOrigin
105 instance ToSchema DataOrigin where
106 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
107
108 allDataOrigins :: [DataOrigin]
109 allDataOrigins = map InternalOrigin API.externalAPIs
110 <> map ExternalOrigin API.externalAPIs
111
112 ---------------
113
114 data DataText = DataOld ![NodeId]
115 | DataNew ![[HyperdataDocument]]
116
117
118 -- TODO use the split parameter in config file
119 getDataText :: FlowCmdM env err m
120 => DataOrigin
121 -> TermType Lang
122 -> API.Query
123 -> Maybe API.Limit
124 -> m DataText
125 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
126 <$> splitEvery 500
127 <$> API.get api (_tt_lang la) q li
128 getDataText (InternalOrigin _) _la q _li = do
129 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
130 (UserName userMaster)
131 (Left "")
132 (Nothing :: Maybe HyperdataCorpus)
133 ids <- map fst <$> searchInDatabase cId (stemIt q)
134 pure $ DataOld ids
135
136 -------------------------------------------------------------------------------
137 flowDataText :: FlowCmdM env err m
138 => User
139 -> DataText
140 -> TermType Lang
141 -> CorpusId
142 -> m CorpusId
143 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
144 where
145 corpusType = (Nothing :: Maybe HyperdataCorpus)
146 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
147
148 ------------------------------------------------------------------------
149 -- TODO use proxy
150 flowAnnuaire :: FlowCmdM env err m
151 => User
152 -> Either CorpusName [CorpusId]
153 -> (TermType Lang)
154 -> FilePath
155 -> m AnnuaireId
156 flowAnnuaire u n l filePath = do
157 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
158 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
159
160 ------------------------------------------------------------------------
161 flowCorpusFile :: FlowCmdM env err m
162 => User
163 -> Either CorpusName [CorpusId]
164 -> Limit -- Limit the number of docs (for dev purpose)
165 -> TermType Lang -> FileFormat -> FilePath
166 -> m CorpusId
167 flowCorpusFile u n l la ff fp = do
168 docs <- liftBase ( splitEvery 500
169 <$> take l
170 <$> parseFile ff fp
171 )
172 flowCorpus u n la (map (map toHyperdataDocument) docs)
173
174 ------------------------------------------------------------------------
175 -- | TODO improve the needed type to create/update a corpus
176 -- (For now, Either is enough)
177 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
178 => User
179 -> Either CorpusName [CorpusId]
180 -> TermType Lang
181 -> [[a]]
182 -> m CorpusId
183 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
184
185
186 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
187 => Maybe c
188 -> User
189 -> Either CorpusName [CorpusId]
190 -> TermType Lang
191 -> [[a]]
192 -> m CorpusId
193 flow c u cn la docs = do
194 ids <- traverse (insertMasterDocs c la) docs
195 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
196
197 ------------------------------------------------------------------------
198 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
199 => Lang
200 -> User
201 -> Either CorpusName [CorpusId]
202 -> Maybe c
203 -> [NodeId]
204 -> m CorpusId
205 flowCorpusUser l user corpusName ctype ids = do
206 -- User Flow
207 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
208 listId <- getOrMkList userCorpusId userId
209 _cooc <- mkNode NodeListCooc listId userId
210 -- TODO: check if present already, ignore
211 _ <- Doc.add userCorpusId ids
212
213 _tId <- mkNode NodeTexts userCorpusId userId
214 -- printDebug "Node Text Id" tId
215
216 -- User List Flow
217 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
218 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
219 _userListId <- flowList_DbRepo listId ngs
220 _mastListId <- getOrMkList masterCorpusId masterUserId
221 -- _ <- insertOccsUpdates userCorpusId mastListId
222 -- printDebug "userListId" userListId
223 -- User Graph Flow
224 _ <- mkDashboard userCorpusId userId
225 _ <- mkGraph userCorpusId userId
226 --_ <- mkPhylo userCorpusId userId
227
228 -- Annuaire Flow
229 -- _ <- mkAnnuaire rootUserId userId
230 pure userCorpusId
231
232
233 insertMasterDocs :: ( FlowCmdM env err m
234 , FlowCorpus a
235 , MkCorpus c
236 )
237 => Maybe c
238 -> TermType Lang
239 -> [a]
240 -> m [DocId]
241 insertMasterDocs c lang hs = do
242 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
243
244 -- TODO Type NodeDocumentUnicised
245 let docs = map addUniqId hs
246 ids <- insertDb masterUserId masterCorpusId docs
247 let
248 ids' = map reId ids
249 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
250 -- TODO
251 -- create a corpus with database name (CSV or PubMed)
252 -- add documents to the corpus (create node_node link)
253 -- this will enable global database monitoring
254
255 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
256 maps <- mapNodeIdNgrams
257 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
258
259 terms2id <- insertNgrams $ Map.keys maps
260 -- to be removed
261 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
262
263 -- new
264 lId <- getOrMkList masterCorpusId masterUserId
265 mapCgramsId <- listInsertDb lId toNodeNgramsW'
266 $ map (first _ngramsTerms . second Map.keys)
267 $ Map.toList maps
268 -- insertDocNgrams
269 _return <- insertNodeNodeNgrams2
270 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
271 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
272 <*> Just (fromIntegral w :: Double)
273 | (terms'', mapNgramsTypes) <- Map.toList maps
274 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
275 , (nId, w) <- Map.toList mapNodeIdWeight
276 ]
277
278 _ <- Doc.add masterCorpusId ids'
279 _cooc <- mkNode NodeListCooc lId masterUserId
280 -- to be removed
281 _ <- insertDocNgrams lId indexedNgrams
282
283 pure ids'
284
285
286 ------------------------------------------------------------------------
287
288
289
290 ------------------------------------------------------------------------
291 viewUniqId' :: UniqId a
292 => a
293 -> (HashId, a)
294 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
295 where
296 err = panic "[ERROR] Database.Flow.toInsert"
297
298
299 toInserted :: [ReturnId]
300 -> Map HashId ReturnId
301 toInserted =
302 Map.fromList . map (\r -> (reUniqId r, r) )
303 . filter (\r -> reInserted r == True)
304
305 mergeData :: Map HashId ReturnId
306 -> Map HashId a
307 -> [DocumentWithId a]
308 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
309 where
310 toDocumentWithId (sha,hpd) =
311 DocumentWithId <$> fmap reId (lookup sha rs)
312 <*> Just hpd
313
314 ------------------------------------------------------------------------
315
316 instance HasText HyperdataContact
317 where
318 hasText = undefined
319
320 ------------------------------------------------------------------------
321 ------------------------------------------------------------------------
322
323 documentIdWithNgrams :: HasNodeError err
324 => (a
325 -> Cmd err (Map Ngrams (Map NgramsType Int)))
326 -> [DocumentWithId a]
327 -> Cmd err [DocumentIdWithNgrams a]
328 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
329 where
330 toDocumentIdWithNgrams d = do
331 e <- f $ documentData d
332 pure $ DocumentIdWithNgrams d e
333
334
335 ------------------------------------------------------------------------
336
337
338 instance ExtractNgramsT HyperdataContact
339 where
340 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
341 where
342 extract :: TermType Lang -> HyperdataContact
343 -> Cmd err (Map Ngrams (Map NgramsType Int))
344 extract _l hc' = do
345 let authors = map text2ngrams
346 $ maybe ["Nothing"] (\a -> [a])
347 $ view (hc_who . _Just . cw_lastName) hc'
348
349 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
350
351 instance HasText HyperdataDocument
352 where
353 hasText h = catMaybes [ _hyperdataDocument_title h
354 , _hyperdataDocument_abstract h
355 ]
356
357 instance ExtractNgramsT HyperdataDocument
358 where
359 extractNgramsT :: TermType Lang
360 -> HyperdataDocument
361 -> Cmd err (Map Ngrams (Map NgramsType Int))
362 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
363 where
364 extractNgramsT' :: TermType Lang
365 -> HyperdataDocument
366 -> Cmd err (Map Ngrams (Map NgramsType Int))
367 extractNgramsT' lang' doc = do
368 let source = text2ngrams
369 $ maybe "Nothing" identity
370 $ _hyperdataDocument_source doc
371
372 institutes = map text2ngrams
373 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
374 $ _hyperdataDocument_institutes doc
375
376 authors = map text2ngrams
377 $ maybe ["Nothing"] (splitOn ", ")
378 $ _hyperdataDocument_authors doc
379
380 terms' <- map text2ngrams
381 <$> map (intercalate " " . _terms_label)
382 <$> concat
383 <$> liftBase (extractTerms lang' $ hasText doc)
384
385 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
386 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
387 <> [(a', Map.singleton Authors 1) | a' <- authors ]
388 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
389
390