]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
Merge branch 'dev-doc-annotation-issue' of https://gitlab.iscpif.fr/gargantext/haskel...
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35
36 , getOrMkRoot
37 , getOrMk_RootWithCorpus
38 , TermType(..)
39 , DataOrigin(..)
40 , allDataOrigins
41
42 , do_api
43 )
44 where
45
46 import Control.Lens ((^.), view, _Just, makeLenses)
47 import Data.Aeson.TH (deriveJSON)
48 import Data.Either
49 import Data.List (concat)
50 import qualified Data.Map as Map
51 import Data.Map (Map, lookup)
52 import Data.Maybe (Maybe(..), catMaybes)
53 import Data.Monoid
54 import Data.Swagger
55 import Data.Text (splitOn, intercalate)
56 import Data.Traversable (traverse)
57 import Data.Tuple.Extra (first, second)
58 import GHC.Generics (Generic)
59 import System.FilePath (FilePath)
60
61 import Gargantext.Core (Lang(..))
62 import Gargantext.Core.Flow.Types
63 import Gargantext.Core.Types (Terms(..))
64 import Gargantext.Core.Types.Individu (User(..))
65 import Gargantext.Core.Types.Main
66 import Gargantext.Database.Action.Flow.List
67 import Gargantext.Database.Action.Flow.Types
68 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
69 import Gargantext.Database.Query.Table.Node
70 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
71 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
72 import Gargantext.Database.Action.Search (searchDocInDatabase)
73 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
74 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
75 import Gargantext.Database.Admin.Types.Hyperdata
76 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
77 import Gargantext.Database.Prelude
78 import Gargantext.Database.Query.Table.Ngrams
79 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
80 import Gargantext.Database.Query.Table.NodeNodeNgrams2
81 import Gargantext.Ext.IMT (toSchoolName)
82 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
83 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
84 import Gargantext.Text
85 import Gargantext.Prelude
86 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
87 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
88 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
89 import Gargantext.Text.Terms
90 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
91 import qualified Gargantext.Text.Corpus.API as API
92
93 ------------------------------------------------------------------------
94 -- TODO use internal with API name (could be old data)
95 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
96 | ExternalOrigin { _do_api :: API.ExternalAPIs }
97 -- TODO Web
98 deriving (Generic, Eq)
99
100 makeLenses ''DataOrigin
101 deriveJSON (unPrefix "_do_") ''DataOrigin
102 instance ToSchema DataOrigin where
103 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
104
105 allDataOrigins :: [DataOrigin]
106 allDataOrigins = map InternalOrigin API.externalAPIs
107 <> map ExternalOrigin API.externalAPIs
108
109 ---------------
110 data DataText = DataOld ![NodeId]
111 | DataNew ![[HyperdataDocument]]
112
113 -- TODO use the split parameter in config file
114 getDataText :: FlowCmdM env err m
115 => DataOrigin
116 -> TermType Lang
117 -> API.Query
118 -> Maybe API.Limit
119 -> m DataText
120 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
121 <$> splitEvery 500
122 <$> API.get api (_tt_lang la) q li
123 getDataText (InternalOrigin _) _la q _li = do
124 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
125 (UserName userMaster)
126 (Left "")
127 (Nothing :: Maybe HyperdataCorpus)
128 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
129 pure $ DataOld ids
130
131 -------------------------------------------------------------------------------
132 flowDataText :: FlowCmdM env err m
133 => User
134 -> DataText
135 -> TermType Lang
136 -> CorpusId
137 -> m CorpusId
138 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
139 where
140 corpusType = (Nothing :: Maybe HyperdataCorpus)
141 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
142
143 ------------------------------------------------------------------------
144 -- TODO use proxy
145 flowAnnuaire :: FlowCmdM env err m
146 => User
147 -> Either CorpusName [CorpusId]
148 -> (TermType Lang)
149 -> FilePath
150 -> m AnnuaireId
151 flowAnnuaire u n l filePath = do
152 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
153 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
154
155 ------------------------------------------------------------------------
156 flowCorpusFile :: FlowCmdM env err m
157 => User
158 -> Either CorpusName [CorpusId]
159 -> Limit -- Limit the number of docs (for dev purpose)
160 -> TermType Lang -> FileFormat -> FilePath
161 -> m CorpusId
162 flowCorpusFile u n l la ff fp = do
163 docs <- liftBase ( splitEvery 500
164 <$> take l
165 <$> parseFile ff fp
166 )
167 flowCorpus u n la (map (map toHyperdataDocument) docs)
168
169 ------------------------------------------------------------------------
170 -- | TODO improve the needed type to create/update a corpus
171 -- (For now, Either is enough)
172 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
173 => User
174 -> Either CorpusName [CorpusId]
175 -> TermType Lang
176 -> [[a]]
177 -> m CorpusId
178 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
179
180
181 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
182 => Maybe c
183 -> User
184 -> Either CorpusName [CorpusId]
185 -> TermType Lang
186 -> [[a]]
187 -> m CorpusId
188 flow c u cn la docs = do
189 ids <- traverse (insertMasterDocs c la) docs
190 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
191
192 ------------------------------------------------------------------------
193 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
194 => Lang
195 -> User
196 -> Either CorpusName [CorpusId]
197 -> Maybe c
198 -> [NodeId]
199 -> m CorpusId
200 flowCorpusUser l user corpusName ctype ids = do
201 -- User Flow
202 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
203 listId <- getOrMkList userCorpusId userId
204 _cooc <- insertDefaultNode NodeListCooc listId userId
205 -- TODO: check if present already, ignore
206 _ <- Doc.add userCorpusId ids
207
208 _tId <- insertDefaultNode NodeTexts userCorpusId userId
209 -- printDebug "Node Text Id" tId
210
211 -- User List Flow
212 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
213 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
214 _userListId <- flowList_DbRepo listId ngs
215 _mastListId <- getOrMkList masterCorpusId masterUserId
216 -- _ <- insertOccsUpdates userCorpusId mastListId
217 -- printDebug "userListId" userListId
218 -- User Graph Flow
219 _ <- insertDefaultNode NodeDashboard userCorpusId userId
220 _ <- insertDefaultNode NodeGraph userCorpusId userId
221 --_ <- mkPhylo userCorpusId userId
222
223 -- Annuaire Flow
224 -- _ <- mkAnnuaire rootUserId userId
225 pure userCorpusId
226
227
228 insertMasterDocs :: ( FlowCmdM env err m
229 , FlowCorpus a
230 , MkCorpus c
231 )
232 => Maybe c
233 -> TermType Lang
234 -> [a]
235 -> m [DocId]
236 insertMasterDocs c lang hs = do
237 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
238
239 -- TODO Type NodeDocumentUnicised
240 let docs = map addUniqId hs
241 ids <- insertDb masterUserId masterCorpusId docs
242 let
243 ids' = map reId ids
244 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
245 -- TODO
246 -- create a corpus with database name (CSV or PubMed)
247 -- add documents to the corpus (create node_node link)
248 -- this will enable global database monitoring
249
250 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
251 maps <- mapNodeIdNgrams
252 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
253
254 terms2id <- insertNgrams $ Map.keys maps
255 -- to be removed
256 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
257
258 -- new
259 lId <- getOrMkList masterCorpusId masterUserId
260 mapCgramsId <- listInsertDb lId toNodeNgramsW'
261 $ map (first _ngramsTerms . second Map.keys)
262 $ Map.toList maps
263 -- insertDocNgrams
264 _return <- insertNodeNodeNgrams2
265 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
266 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
267 <*> Just (fromIntegral w :: Double)
268 | (terms'', mapNgramsTypes) <- Map.toList maps
269 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
270 , (nId, w) <- Map.toList mapNodeIdWeight
271 ]
272
273 _ <- Doc.add masterCorpusId ids'
274 _cooc <- insertDefaultNode NodeListCooc lId masterUserId
275 -- to be removed
276 _ <- insertDocNgrams lId indexedNgrams
277
278 pure ids'
279
280
281 ------------------------------------------------------------------------
282
283
284
285 ------------------------------------------------------------------------
286 viewUniqId' :: UniqId a
287 => a
288 -> (HashId, a)
289 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
290 where
291 err = panic "[ERROR] Database.Flow.toInsert"
292
293
294 toInserted :: [ReturnId]
295 -> Map HashId ReturnId
296 toInserted =
297 Map.fromList . map (\r -> (reUniqId r, r) )
298 . filter (\r -> reInserted r == True)
299
300 mergeData :: Map HashId ReturnId
301 -> Map HashId a
302 -> [DocumentWithId a]
303 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
304 where
305 toDocumentWithId (sha,hpd) =
306 DocumentWithId <$> fmap reId (lookup sha rs)
307 <*> Just hpd
308
309 ------------------------------------------------------------------------
310
311 instance HasText HyperdataContact
312 where
313 hasText = undefined
314
315 ------------------------------------------------------------------------
316 ------------------------------------------------------------------------
317
318 documentIdWithNgrams :: HasNodeError err
319 => (a
320 -> Cmd err (Map Ngrams (Map NgramsType Int)))
321 -> [DocumentWithId a]
322 -> Cmd err [DocumentIdWithNgrams a]
323 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
324 where
325 toDocumentIdWithNgrams d = do
326 e <- f $ documentData d
327 pure $ DocumentIdWithNgrams d e
328
329
330 ------------------------------------------------------------------------
331
332
333 instance ExtractNgramsT HyperdataContact
334 where
335 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
336 where
337 extract :: TermType Lang -> HyperdataContact
338 -> Cmd err (Map Ngrams (Map NgramsType Int))
339 extract _l hc' = do
340 let authors = map text2ngrams
341 $ maybe ["Nothing"] (\a -> [a])
342 $ view (hc_who . _Just . cw_lastName) hc'
343
344 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
345
346 instance HasText HyperdataDocument
347 where
348 hasText h = catMaybes [ _hd_title h
349 , _hd_abstract h
350 ]
351
352 instance ExtractNgramsT HyperdataDocument
353 where
354 extractNgramsT :: TermType Lang
355 -> HyperdataDocument
356 -> Cmd err (Map Ngrams (Map NgramsType Int))
357 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
358 where
359 extractNgramsT' :: TermType Lang
360 -> HyperdataDocument
361 -> Cmd err (Map Ngrams (Map NgramsType Int))
362 extractNgramsT' lang' doc = do
363 let source = text2ngrams
364 $ maybe "Nothing" identity
365 $ _hd_source doc
366
367 institutes = map text2ngrams
368 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
369 $ _hd_institutes doc
370
371 authors = map text2ngrams
372 $ maybe ["Nothing"] (splitOn ", ")
373 $ _hd_authors doc
374
375 terms' <- map text2ngrams
376 <$> map (intercalate " " . _terms_label)
377 <$> concat
378 <$> liftBase (extractTerms lang' $ hasText doc)
379
380 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
381 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
382 <> [(a', Map.singleton Authors 1) | a' <- authors ]
383 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
384
385