]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[Crypto] files now in G.Core.Crypto
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30
31 , flowCorpusFile
32 , flowCorpus
33 , flowAnnuaire
34
35 , getOrMkRoot
36 , getOrMk_RootWithCorpus
37 , TermType(..)
38 , DataOrigin(..)
39 , allDataOrigins
40
41 , do_api
42 )
43 where
44
45 import Control.Lens ((^.), view, _Just, makeLenses)
46 import Data.Aeson.TH (deriveJSON)
47 import Data.Either
48 import Data.List (concat)
49 import qualified Data.Map as Map
50 import Data.Map (Map, lookup)
51 import Data.Maybe (Maybe(..), catMaybes)
52 import Data.Monoid
53 import Data.Swagger
54 import Data.Text (splitOn, intercalate)
55 import Data.Traversable (traverse)
56 import Data.Tuple.Extra (first, second)
57 import GHC.Generics (Generic)
58 import System.FilePath (FilePath)
59
60 import Gargantext.Core (Lang(..))
61 import Gargantext.Core.Flow.Types
62 import Gargantext.Core.Types (Terms(..))
63 import Gargantext.Core.Types.Individu (User(..))
64 import Gargantext.Core.Types.Main
65 import Gargantext.Database.Action.Flow.List
66 import Gargantext.Database.Action.Flow.Types
67 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
68 import Gargantext.Database.Query.Table.Node
69 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
70 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
71 import Gargantext.Database.Action.Search (searchDocInDatabase)
72 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
73 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
74 import Gargantext.Database.Admin.Types.Hyperdata
75 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
76 import Gargantext.Database.Prelude
77 import Gargantext.Database.Query.Table.Ngrams
78 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
79 import Gargantext.Database.Query.Table.NodeNodeNgrams2
80 import Gargantext.Ext.IMT (toSchoolName)
81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
82 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
83 import Gargantext.Text
84 import Gargantext.Prelude
85 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
86 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
87 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
88 import Gargantext.Text.Terms
89 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
90 import qualified Gargantext.Text.Corpus.API as API
91
92 ------------------------------------------------------------------------
93 -- TODO use internal with API name (could be old data)
94 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
95 | ExternalOrigin { _do_api :: API.ExternalAPIs }
96 -- TODO Web
97 deriving (Generic, Eq)
98
99 makeLenses ''DataOrigin
100 deriveJSON (unPrefix "_do_") ''DataOrigin
101 instance ToSchema DataOrigin where
102 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
103
104 allDataOrigins :: [DataOrigin]
105 allDataOrigins = map InternalOrigin API.externalAPIs
106 <> map ExternalOrigin API.externalAPIs
107
108 ---------------
109 data DataText = DataOld ![NodeId]
110 | DataNew ![[HyperdataDocument]]
111
112 -- TODO use the split parameter in config file
113 getDataText :: FlowCmdM env err m
114 => DataOrigin
115 -> TermType Lang
116 -> API.Query
117 -> Maybe API.Limit
118 -> m DataText
119 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
120 <$> splitEvery 500
121 <$> API.get api (_tt_lang la) q li
122 getDataText (InternalOrigin _) _la q _li = do
123 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
124 (UserName userMaster)
125 (Left "")
126 (Nothing :: Maybe HyperdataCorpus)
127 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
128 pure $ DataOld ids
129
130 -------------------------------------------------------------------------------
131 flowDataText :: FlowCmdM env err m
132 => User
133 -> DataText
134 -> TermType Lang
135 -> CorpusId
136 -> m CorpusId
137 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
138 where
139 corpusType = (Nothing :: Maybe HyperdataCorpus)
140 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
141
142 ------------------------------------------------------------------------
143 -- TODO use proxy
144 flowAnnuaire :: FlowCmdM env err m
145 => User
146 -> Either CorpusName [CorpusId]
147 -> (TermType Lang)
148 -> FilePath
149 -> m AnnuaireId
150 flowAnnuaire u n l filePath = do
151 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
152 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
153
154 ------------------------------------------------------------------------
155 flowCorpusFile :: FlowCmdM env err m
156 => User
157 -> Either CorpusName [CorpusId]
158 -> Limit -- Limit the number of docs (for dev purpose)
159 -> TermType Lang -> FileFormat -> FilePath
160 -> m CorpusId
161 flowCorpusFile u n l la ff fp = do
162 docs <- liftBase ( splitEvery 500
163 <$> take l
164 <$> parseFile ff fp
165 )
166 flowCorpus u n la (map (map toHyperdataDocument) docs)
167
168 ------------------------------------------------------------------------
169 -- | TODO improve the needed type to create/update a corpus
170 -- (For now, Either is enough)
171 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
172 => User
173 -> Either CorpusName [CorpusId]
174 -> TermType Lang
175 -> [[a]]
176 -> m CorpusId
177 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
178
179
180 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
181 => Maybe c
182 -> User
183 -> Either CorpusName [CorpusId]
184 -> TermType Lang
185 -> [[a]]
186 -> m CorpusId
187 flow c u cn la docs = do
188 ids <- traverse (insertMasterDocs c la) docs
189 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
190
191 ------------------------------------------------------------------------
192 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
193 => Lang
194 -> User
195 -> Either CorpusName [CorpusId]
196 -> Maybe c
197 -> [NodeId]
198 -> m CorpusId
199 flowCorpusUser l user corpusName ctype ids = do
200 -- User Flow
201 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
202 listId <- getOrMkList userCorpusId userId
203 _cooc <- insertDefaultNode NodeListCooc listId userId
204 -- TODO: check if present already, ignore
205 _ <- Doc.add userCorpusId ids
206
207 _tId <- insertDefaultNode NodeTexts userCorpusId userId
208 -- printDebug "Node Text Id" tId
209
210 -- User List Flow
211 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
212 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
213 _userListId <- flowList_DbRepo listId ngs
214 _mastListId <- getOrMkList masterCorpusId masterUserId
215 -- _ <- insertOccsUpdates userCorpusId mastListId
216 -- printDebug "userListId" userListId
217 -- User Graph Flow
218 _ <- insertDefaultNode NodeDashboard userCorpusId userId
219 _ <- insertDefaultNode NodeGraph userCorpusId userId
220 --_ <- mkPhylo userCorpusId userId
221
222 -- Annuaire Flow
223 -- _ <- mkAnnuaire rootUserId userId
224 pure userCorpusId
225
226
227 insertMasterDocs :: ( FlowCmdM env err m
228 , FlowCorpus a
229 , MkCorpus c
230 )
231 => Maybe c
232 -> TermType Lang
233 -> [a]
234 -> m [DocId]
235 insertMasterDocs c lang hs = do
236 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
237
238 -- TODO Type NodeDocumentUnicised
239 let docs = map addUniqId hs
240 ids <- insertDb masterUserId masterCorpusId docs
241 let
242 ids' = map reId ids
243 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
244 -- TODO
245 -- create a corpus with database name (CSV or PubMed)
246 -- add documents to the corpus (create node_node link)
247 -- this will enable global database monitoring
248
249 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
250 maps <- mapNodeIdNgrams
251 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
252
253 terms2id <- insertNgrams $ Map.keys maps
254 -- to be removed
255 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
256
257 -- new
258 lId <- getOrMkList masterCorpusId masterUserId
259 mapCgramsId <- listInsertDb lId toNodeNgramsW'
260 $ map (first _ngramsTerms . second Map.keys)
261 $ Map.toList maps
262 -- insertDocNgrams
263 _return <- insertNodeNodeNgrams2
264 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
265 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
266 <*> Just (fromIntegral w :: Double)
267 | (terms'', mapNgramsTypes) <- Map.toList maps
268 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
269 , (nId, w) <- Map.toList mapNodeIdWeight
270 ]
271
272 _ <- Doc.add masterCorpusId ids'
273 _cooc <- insertDefaultNode NodeListCooc lId masterUserId
274 -- to be removed
275 _ <- insertDocNgrams lId indexedNgrams
276
277 pure ids'
278
279
280 ------------------------------------------------------------------------
281
282
283
284 ------------------------------------------------------------------------
285 viewUniqId' :: UniqId a
286 => a
287 -> (HashId, a)
288 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
289 where
290 err = panic "[ERROR] Database.Flow.toInsert"
291
292
293 toInserted :: [ReturnId]
294 -> Map HashId ReturnId
295 toInserted =
296 Map.fromList . map (\r -> (reUniqId r, r) )
297 . filter (\r -> reInserted r == True)
298
299 mergeData :: Map HashId ReturnId
300 -> Map HashId a
301 -> [DocumentWithId a]
302 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
303 where
304 toDocumentWithId (sha,hpd) =
305 DocumentWithId <$> fmap reId (lookup sha rs)
306 <*> Just hpd
307
308 ------------------------------------------------------------------------
309
310 instance HasText HyperdataContact
311 where
312 hasText = undefined
313
314 ------------------------------------------------------------------------
315 ------------------------------------------------------------------------
316
317 documentIdWithNgrams :: HasNodeError err
318 => (a
319 -> Cmd err (Map Ngrams (Map NgramsType Int)))
320 -> [DocumentWithId a]
321 -> Cmd err [DocumentIdWithNgrams a]
322 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
323 where
324 toDocumentIdWithNgrams d = do
325 e <- f $ documentData d
326 pure $ DocumentIdWithNgrams d e
327
328
329 ------------------------------------------------------------------------
330
331
332 instance ExtractNgramsT HyperdataContact
333 where
334 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
335 where
336 extract :: TermType Lang -> HyperdataContact
337 -> Cmd err (Map Ngrams (Map NgramsType Int))
338 extract _l hc' = do
339 let authors = map text2ngrams
340 $ maybe ["Nothing"] (\a -> [a])
341 $ view (hc_who . _Just . cw_lastName) hc'
342
343 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
344
345 instance HasText HyperdataDocument
346 where
347 hasText h = catMaybes [ _hd_title h
348 , _hd_abstract h
349 ]
350
351 instance ExtractNgramsT HyperdataDocument
352 where
353 extractNgramsT :: TermType Lang
354 -> HyperdataDocument
355 -> Cmd err (Map Ngrams (Map NgramsType Int))
356 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
357 where
358 extractNgramsT' :: TermType Lang
359 -> HyperdataDocument
360 -> Cmd err (Map Ngrams (Map NgramsType Int))
361 extractNgramsT' lang' doc = do
362 let source = text2ngrams
363 $ maybe "Nothing" identity
364 $ _hd_source doc
365
366 institutes = map text2ngrams
367 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
368 $ _hd_institutes doc
369
370 authors = map text2ngrams
371 $ maybe ["Nothing"] (splitOn ", ")
372 $ _hd_authors doc
373
374 terms' <- map text2ngrams
375 <$> map (intercalate " " . _terms_label)
376 <$> concat
377 <$> liftBase (extractTerms lang' $ hasText doc)
378
379 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
380 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
381 <> [(a', Map.singleton Authors 1) | a' <- authors ]
382 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
383
384