]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[CLEAN] type
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module      : Gargantext.Database.Action.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29 {-# LANGUAGE TemplateHaskell #-}
30
31 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
32 ( FlowCmdM
33 , getDataText
34 , flowDataText
35
36 , flowCorpusFile
37 , flowCorpus
38 , flowAnnuaire
39
40 , getOrMkRoot
41 , getOrMk_RootWithCorpus
42 , TermType(..)
43 , DataOrigin(..)
44 , allDataOrigins
45
46 , do_api
47 )
48 where
49
50 import Control.Lens ((^.), view, _Just, makeLenses)
51 import Data.Aeson.TH (deriveJSON)
52 import Data.Either
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (Maybe(..), catMaybes)
56 import Data.Monoid
57 import Data.Swagger
58 import Data.Text (splitOn, intercalate)
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import Debug.Trace (trace)
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Flow.Types
64 import Gargantext.Core.Types (Terms(..))
65 import Gargantext.Core.Types.Individu (User(..))
66 import Gargantext.Core.Types.Main
67 import Gargantext.Database.Action.Flow.List
68 import Gargantext.Database.Action.Flow.Types
69 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
70 import Gargantext.Database.Action.Query.Node
71 import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..), ContactWho(..))
72 import Gargantext.Database.Action.Query.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
73 import Gargantext.Database.Action.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
74 import Gargantext.Database.Action.Search (searchInDatabase)
75 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
76 import Gargantext.Database.Admin.Types.Errors (HasNodeError(..))
77 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
78 import Gargantext.Database.Admin.Utils (Cmd)
79 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
80 import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
81 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
82 import Gargantext.Ext.IMT (toSchoolName)
83 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
84 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
85 import Gargantext.Prelude
86 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
87 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
88 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
89 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
90 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
91 import GHC.Generics (Generic)
92 import Prelude (String)
93 import System.FilePath (FilePath)
94 import qualified Data.List as List
95 import qualified Data.Map as Map
96 import qualified Data.Text as Text
97 import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
98 import qualified Gargantext.Text.Corpus.API as API
99
100 ------------------------------------------------------------------------
101 -- TODO use internal with API name (could be old data)
-- | Where a flow's documents come from: this instance's own database
-- ("internal") or a remote source ("external").  Both carry the
-- 'API.ExternalAPIs' tag, since internal data may be old data that was
-- originally fetched through that API.
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)
106
-- Lenses, JSON (de)serialisation and Swagger schema for 'DataOrigin';
-- both drop the "_do_" field prefix.
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
111
-- | Every known data origin: each external API listed once as an
-- internal origin, then once as an external origin.
allDataOrigins :: [DataOrigin]
allDataOrigins = [ origin api | origin <- [InternalOrigin, ExternalOrigin]
                              , api    <- API.externalAPIs
                 ]
115
116 ---------------
117
-- | Texts to be flowed: either ids of documents already stored in the
-- database, or batches of new documents (chunked for insertion).
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
120
121
-- | Retrieve texts for a flow.  External origins query the remote API
-- and batch the results; internal origins search the master corpus.
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li = do
  docs <- liftBase $ API.get api (_tt_lang la) q li
  -- chunks of 500 documents for batched insertion
  pure $ DataNew $ splitEvery 500 docs
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  oldIds <- map fst <$> searchInDatabase cId (stemIt q)
  pure $ DataOld oldIds
139
140 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into an existing corpus: already-stored documents
-- are attached directly, new documents go through the full corpus flow.
flowDataText :: FlowCmdM env err m
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText u (DataOld ids) tt cid =
  flowCorpusUser (_tt_lang tt) u (Right [cid]) (Nothing :: Maybe HyperdataCorpus) ids
flowDataText u (DataNew txt) tt cid =
  flowCorpus u (Right [cid]) tt txt
151
152 ------------------------------------------------------------------------
153 -- TODO use proxy
-- | Flow a directory ("annuaire") of contacts deserialised from a file.
flowAnnuaire :: FlowCmdM env err m
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> m AnnuaireId
flowAnnuaire u n l filePath = do
  contacts <- liftBase (deserialiseImtUsersFromFile filePath :: IO [HyperdataContact])
  -- insert in chunks of 500 contacts
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l (splitEvery 500 contacts)
163
164 ------------------------------------------------------------------------
-- | Parse a local file and flow its documents into a corpus.
flowCorpusFile :: FlowCmdM env err m
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> TermType Lang -> FileFormat -> FilePath
               -> m CorpusId
flowCorpusFile u n l la ff fp = do
  parsed <- liftBase $ parseFile ff fp
  -- keep at most l documents, batched by 500 for insertion
  let batches = splitEvery 500 $ take l parsed
  flowCorpus u n la (map (map toHyperdataDocument) batches)
177
178 ------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
-- Flow documents into a user corpus (created by name, or targeted by
-- ids), using the default corpus hyperdata.
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
188
189
-- | Generic flow: insert each batch of documents into the master
-- corpus, then attach all resulting node ids to the user corpus.
flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs =
  traverse (insertMasterDocs c la) docs
    >>= flowCorpusUser (la ^. tt_lang) u cn c . concat
200
201 ------------------------------------------------------------------------
-- | Create (or fetch) the user's corpus together with its satellite
-- nodes (ngrams list, cooc, texts, dashboard, graph), attach the given
-- document ids to it, and build the user's ngrams lists against the
-- master corpus.  Returns the user corpus id.
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  -- User Flow: root + corpus + list + cooc nodes for this user
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  _cooc <- mkNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  _tId <- mkNode NodeTexts userCorpusId userId
  -- printDebug "Node Text Id" tId

  -- User List Flow: build this user's ngrams lists from the master corpus
  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow: dashboard and graph nodes under the user corpus
  _ <- mkDashboard userCorpusId userId
  _ <- mkGraph  userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId

  -- Annuaire Flow
  -- _ <- mkAnnuaire  rootUserId userId
  pure userCorpusId
235
236
-- | Insert documents into the master corpus: uniq-id and insert the
-- documents, extract their ngrams, index them, and record the
-- document/ngrams occurrences (both in the new node_node_ngrams2 table
-- and, transitionally, via 'insertDocNgrams').  Returns the ids of the
-- inserted document nodes.
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c

  -- TODO Type NodeDocumentUnicised
  let docs = map addUniqId hs
  ids <- insertDb masterUserId masterCorpusId docs
  let
    ids' = map reId ids
    -- join inserted rows back to their documents, keyed by uniq hash
    documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  maps <- mapNodeIdNgrams
       <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  terms2id <- insertNgrams $ Map.keys maps
  -- to be removed
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps

  -- new
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ Map.toList maps
  -- insertDocNgrams
  -- one row per (term, ngrams type, document) with its weight;
  -- rows whose cgrams id cannot be resolved are dropped by catMaybes
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
                                         <*> Just (fromIntegral w :: Double)
                       | (terms, mapNgramsTypes) <- Map.toList maps
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w) <- Map.toList mapNodeIdWeight
                       ]

  _ <- Doc.add masterCorpusId ids'
  _cooc <- mkNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams

  pure ids'
288
289
-- | For 'Unsupervised' term extraction, ensure the tries are present:
-- when none were supplied, build them from the documents' own text.
-- Every other 'TermType' is returned unchanged.
withLang :: HasText a
         => TermType Lang
         -> [DocumentWithId a]
         -> TermType Lang
withLang (Unsupervised l n s m) ns = Unsupervised l n s tries
  where
    tries = case m of
      Just t  -> Just t
      Nothing -> trace ("buildTries here" :: String)
               $ Just
               $ buildTries n
               $ fmap toToken
               $ uniText
               $ Text.intercalate " . "
               $ List.concat
               $ map hasText ns
withLang l _ = l
306
307
308 ------------------------------------------------------------------------
-- | Pair a value with its unique hash id; panics when the hash is
-- missing (documents are expected to have gone through 'addUniqId').
viewUniqId' :: UniqId a
            => a
            -> (HashId, a)
viewUniqId' d = case view uniqId d of
  Just h  -> (h, d)
  Nothing -> panic "[ERROR] Database.Flow.toInsert"
315
316
-- | Index insertion results by their unique hash, keeping only the
-- rows that were actually inserted.
toInserted :: [ReturnId]
           -> Map HashId ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r) )
               -- `filter reInserted` replaces the redundant
               -- `\r -> reInserted r == True`
               . filter reInserted
322
-- | Join insertion results with the original documents by hash,
-- keeping only documents whose insertion yielded a node id.
mergeData :: Map HashId ReturnId
          -> Map HashId a
          -> [DocumentWithId a]
mergeData rs hds = catMaybes $ map toDoc $ Map.toList hds
  where
    toDoc (sha, hpd) = case lookup sha rs of
      Just ret -> Just (DocumentWithId (reId ret) hpd)
      Nothing  -> Nothing
331
332 ------------------------------------------------------------------------
333
-- | NOTE(review): 'hasText' is 'undefined' for contacts, so any call
-- (e.g. via 'withLang' on an unsupervised flow of contacts) crashes at
-- runtime.  Presumably contacts never take that path — confirm.
instance HasText HyperdataContact
  where
    hasText = undefined
337
-- | Ngrams of a contact: only the last name, indexed under 'Authors'
-- with a count of 1 (the literal "Nothing" when the name is absent).
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let lastNames = map text2ngrams
                        $ maybe ["Nothing"] (\ln -> [ln])
                        $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList [ (name, Map.singleton Authors 1) | name <- lastNames ]
350
-- | The text of a document: its title and abstract, whichever are present.
instance HasText HyperdataDocument
  where
    hasText doc = catMaybes [ _hyperdataDocument_title    doc
                            , _hyperdataDocument_abstract doc
                            ]
356
-- | Ngrams of a document: its source, institutes, authors (split on
-- ", ") and language-dependent terms extracted from title+abstract,
-- each indexed under its 'NgramsType' with a count of 1.  Terms longer
-- than 255 characters are truncated by 'filterNgramsT'.
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          -- absent fields are indexed under the literal string "Nothing"
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hyperdataDocument_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
                         $ _hyperdataDocument_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hyperdataDocument_authors doc

          -- terms come from 'extractTerms' over the document's text
          terms' <- map text2ngrams
                <$> map (intercalate " " . _terms_label)
                <$> concat
                <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
                             <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
                             <> [(a', Map.singleton Authors 1) | a' <- authors ]
                             <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
389
-- | Truncate over-long ngrams terms: any term whose text is at least
-- @s@ characters long is cut to its first @s@ characters (the ngrams
-- size field is kept as-is).
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
              -> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map (filter' s) $ Map.toList ms
  where
    -- guards replace the unidiomatic `case (…) of True/False`
    filter' s' (ng@(Ngrams t n), y)
      | Text.length t < s' = (ng, y)
      | otherwise          = (Ngrams (Text.take s' t) n, y)
397
398
-- | Run the ngrams-extraction action over each document, pairing every
-- document with its extracted ngrams map.
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    -- `<$>` replaces the `do { e <- m; pure (… e) }` pattern
    toDocumentIdWithNgrams d = DocumentIdWithNgrams d <$> f (documentData d)
409