{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( FlowCmdM
  , getDataText
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowAnnuaire

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  )
    where

import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.List (concat)
import qualified Data.Map as Map
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes, fromMaybe)
import Data.Monoid
import Data.Swagger
import Data.Text (splitOn, intercalate)
import Data.Time.Segment (jour)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import System.FilePath (FilePath)

import Gargantext.Core (Lang(..))
import Gargantext.Core.Flow.Types
import Gargantext.Core.Types (Terms(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName, nodeTypeId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Table.NodeNodeNgrams2
import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Core.Text
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists, StopSize(..))
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Text.Terms
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Data.Text as DT

------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
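-- | Provenance of the data to flow: 'InternalOrigin' points at data already
-- indexed in the master corpus, 'ExternalOrigin' at data to be fetched from
-- an external API.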
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs

---------------
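-- | Texts to flow into a corpus: either ids of documents already in the
-- database ('DataOld') or batches of freshly fetched documents ('DataNew').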
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]

-- TODO use the split parameter in config file
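-- | Get the data to flow: an 'ExternalOrigin' queries the external API
-- (results batched by 500), an 'InternalOrigin' searches the master corpus
-- for documents matching the stemmed query.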
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew . splitEvery 500
                                       <$> API.get api (_tt_lang la) q li
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids

-------------------------------------------------------------------------------
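-- | Flow a 'DataText' into a corpus: old documents are linked into the user
-- corpus directly, new documents go through the full insertion flow.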
flowDataText :: FlowCmdM env err m
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt

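-- A hypothetical end-to-end use; the 'API.PubMed' constructor, user name and
-- corpus id are illustrative assumptions, not taken from this module:
--
-- > do
-- >   txt <- getDataText (ExternalOrigin API.PubMed) (Multi EN) "glyphosate" (Just 100)
-- >   flowDataText (UserName "alice") txt (Multi EN) corpusId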
------------------------------------------------------------------------
-- TODO use proxy
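-- | Flow an annuaire (contact directory) from a file of serialised IMT users.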
flowAnnuaire :: FlowCmdM env err m
             => User
             -> Either CorpusName [CorpusId]
             -> TermType Lang
             -> FilePath
             -> m AnnuaireId
flowAnnuaire u n l filePath = do
  docs <- liftBase (splitEvery 500 <$> deserialiseImtUsersFromFile filePath
                     :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs

------------------------------------------------------------------------
flowCorpusFile :: FlowCmdM env err m
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- ^ Limit the number of docs (for dev purposes)
               -> TermType Lang -> FileFormat -> FilePath
               -> m CorpusId
flowCorpusFile u n l la ff fp = do
  docs <- liftBase ( splitEvery 500 . take l
                 <$> parseFile ff fp
                   )
  flowCorpus u n la (map (map toHyperdataDocument) docs)

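-- A hypothetical invocation, keeping at most 10000 documents from a CSV file
-- (the file format constructor is assumed; see 'FileFormat' for the list):
--
-- > flowCorpusFile (UserName "alice") (Left "My Corpus") 10000 (Multi EN) CsvGargV3 "docs.csv"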
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


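-- | Generic flow: insert each batch of documents into the master corpus,
-- then link all the resulting ids into the user corpus.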
flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs = do
  -- TODO if public insertMasterDocs else insertUserDocs
  ids <- traverse (insertMasterDocs c la) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)

------------------------------------------------------------------------
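-- | Create (or fetch) the user's root and corpus, attach the documents to
-- the corpus, and build the ngrams lists from the user and master corpora.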
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  _cooc  <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  tId <- insertDefaultNode NodeTexts userCorpusId userId
  printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph     userCorpusId userId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  pure userCorpusId

-- TODO Type NodeDocumentUnicised
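-- | Insert documents in the database: add a unique hash id to each document,
-- insert them, and pair the returned ids with their documents.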
insertDocs :: ( FlowCmdM env err m
              , FlowCorpus a
              )
              => UserId
              -> CorpusId
              -> [a]
              -> m ([DocId], [DocumentWithId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)

{-
-- TODO Maybe NodeId
toNode :: Hyperdata a => NodeType -> ParentId -> UserId -> a -> Node a
toNode NodeDocument p u h = Node 0 "" (nodeTypeId NodeDocument) u (Just p) n date h
  where
    n = maybe "No Title" (DT.take 255) (_hd_title h)
    date = jour y m d
    y = maybe 0 fromIntegral $ _hd_publication_year h
    m = fromMaybe 1 $ _hd_publication_month h
    d = fromMaybe 1 $ _hd_publication_day h
toNode _ _ _ _ = undefined
-}


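-- | Insert documents into the master corpus, extract their ngrams, and index
-- the (document, ngrams type, weight) associations in the database.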
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId hs
  -- (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode NodeDocument masterCorpusId masterUserId) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs <- mapNodeIdNgrams
                <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  terms2id <- insertNgrams $ Map.keys mapNgramsDocs
  -- to be removed
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs

  -- new
  lId         <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ Map.toList mapNgramsDocs
  -- insertDocNgrams
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                         <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w) <- Map.toList mapNodeIdWeight
                       ]

  _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams
  pure ids'

------------------------------------------------------------------------
------------------------------------------------------------------------
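-- | Pair a document with its unique hash id; panics if the id is missing.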
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.viewUniqId'"


toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted

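-- | Join the ids returned by the database with the original documents,
-- dropping any document whose hash has no matching 'ReturnId'.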
mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      DocumentWithId <$> fmap reId (lookup sha rs)
                     <*> Just hpd

------------------------------------------------------------------------
instance HasText HyperdataContact
  where
    hasText = undefined
------------------------------------------------------------------------
------------------------------------------------------------------------
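-- | Run an ngrams extractor over each document, keeping the document id
-- alongside the extracted ngrams.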
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e

------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList [(a', Map.singleton Authors 1) | a' <- authors]

instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hd_title    h
                          , _hd_abstract h
                          ]

instance HasText (Node HyperdataDocument)
  where
    hasText n = catMaybes [ _hd_title    h
                          , _hd_abstract h
                          ]
      where
        h = _node_hyperdata n


instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . splitOn ", ")
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hd_authors doc

          terms' <- map (text2ngrams . intercalate " " . _terms_label) . concat
                <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
                             <> [(i', Map.singleton Institutes  1) | i' <- institutes]
                             <> [(a', Map.singleton Authors     1) | a' <- authors   ]
                             <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'    ]