{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE RankNTypes              #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE DeriveGeneric           #-}
{-# LANGUAGE FlexibleContexts        #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE NoImplicitPrelude       #-}
{-# LANGUAGE OverloadedStrings       #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( FlowCmdM
  , getDataText
  , flowDataText

  , flowCorpusFile
  , flowCorpus
  , flowAnnuaire

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  )
  where

import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Monoid
import Data.Swagger
import Data.Text (Text, splitOn, intercalate)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import Debug.Trace (trace)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Flow.Types
import Gargantext.Core.Types (Terms(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Action.Query.Node
import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..), ContactWho(..))
import Gargantext.Database.Action.Query.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Action.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Action.Search (searchInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Errors (HasNodeError(..))
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Admin.Utils (Cmd)
import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
import Gargantext.Database.Schema.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude
import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Text.List (buildNgramsLists, StopSize(..))
import qualified Gargantext.Text.Terms as GTT (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms.Eleve (buildTries, toToken)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import GHC.Generics (Generic)
import Prelude (String)
import System.FilePath (FilePath)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Text as Text
import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
import qualified Gargantext.Text.Corpus.API as API

------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = Gargantext                 -- internal: data already in the database
                | External API.ExternalAPIs  -- external: data fetched through an API
                -- TODO Web

data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]


-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (External api) la q li = liftBase $ DataNew
                                 <$> splitEvery 500
                                 <$> API.get api (_tt_lang la) q li
getDataText Gargantext la q li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchInDatabase cId (stemIt q)
  pure $ DataOld ids

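-- Usage sketch (comments only): fetching new documents from an external
-- source, then inspecting the result.  'API.PubMed', the query string and
-- the limit below are illustrative assumptions, not values taken from this
-- module.
--
-- > dt <- getDataText (External API.PubMed) (Mono EN) "graph theory" (Just 100)
-- > case dt of
-- >   DataOld ids  -> printDebug "docs already in database" (length ids)
-- >   DataNew docs -> printDebug "batches of new docs"      (length docs)
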
-------------------------------------------------------------------------------
-- API for termType
data TermType lang
  = Mono      { _tt_lang :: lang }
  | Multi     { _tt_lang :: lang }
  | MonoMulti { _tt_lang :: lang }
  | Unsupervised { _tt_lang       :: lang
                 , _tt_windowSize :: Int
                 , _tt_ngramsSize :: Int
                 }
  deriving Generic

-- | GTT.TermType is the richer type: its Unsupervised constructor carries a
-- pre-built model that is not needed for the API use, hence this simplified
-- mirror type and the conversion below (the model is left to Nothing).
tta2tt :: TermType lang -> GTT.TermType lang
tta2tt (Mono l)               = GTT.Mono l
tta2tt (Multi l)              = GTT.Multi l
tta2tt (MonoMulti l)          = GTT.MonoMulti l
tta2tt (Unsupervised la w ng) = GTT.Unsupervised la w ng Nothing

makeLenses ''TermType
deriveJSON (unPrefix "_tt_") ''TermType

instance (ToSchema a) => ToSchema (TermType a) where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_tt_")

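-- For example (sketch): the API value 'Unsupervised EN 10 3' is converted by
-- tta2tt to 'GTT.Unsupervised EN 10 3 Nothing', i.e. the same configuration
-- with no pre-built model attached.
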

flowDataText :: FlowCmdM env err m
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt

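-- Usage sketch (comments only; 'dt' is a DataText obtained from getDataText,
-- the user name and the corpus id 'cid' are illustrative assumptions):
--
-- > cid' <- flowDataText (UserName "alice") dt (Mono EN) cid
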
------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: FlowCmdM env err m
             => User
             -> Either CorpusName [CorpusId]
             -> TermType Lang
             -> FilePath
             -> m AnnuaireId
flowAnnuaire u n l filePath = do
  docs <- liftBase ((splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs

------------------------------------------------------------------------
flowCorpusFile :: FlowCmdM env err m
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purposes)
               -> TermType Lang -> FileFormat -> FilePath
               -> m CorpusId
flowCorpusFile u n l la ff fp = do
  docs <- liftBase ( splitEvery 500
                 <$> take l
                 <$> parseFile ff fp
                   )
  flowCorpus u n la (map (map toHyperdataDocument) docs)

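-- Usage sketch (comments only): indexing the first 10 000 documents of a
-- local file into a new corpus.  The user name, corpus name, file format
-- ('CsvHal') and path are illustrative assumptions.
--
-- > cid <- flowCorpusFile (UserName "alice") (Left "My corpus") 10000
-- >                       (Mono EN) CsvHal "/tmp/docs.csv"
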
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
--   (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs = do
  let la' = tta2tt la
  ids <- traverse (insertMasterDocs c la') docs
  flowCorpusUser (la' ^. GTT.tt_lang) u cn c (concat ids)

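-- Note: 'flow' works batch by batch: each inner list of documents is first
-- inserted into the shared master corpus ('insertMasterDocs'), and the
-- returned document ids are then attached to the user's own corpus
-- ('flowCorpusUser').
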
------------------------------------------------------------------------
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  _cooc  <- mkNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  _tId <- mkNode NodeTexts userCorpusId userId
  -- printDebug "Node Text Id" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId

  -- User Graph Flow
  _ <- mkDashboard userCorpusId userId
  _ <- mkGraph     userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId

  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  pure userCorpusId


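-- | Insert a batch of documents into the master (shared) corpus: add unique
-- ids, insert the documents in the database, extract their ngrams (sources,
-- institutes, authors, terms) and index those ngrams against the inserted
-- documents.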
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> GTT.TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c

  -- TODO Type NodeDocumentUnicised
  let docs = map addUniqId hs
  ids <- insertDb masterUserId masterCorpusId docs
  let
    ids' = map reId ids
    documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: Map Ngrams (Map NgramsType (Map NodeId Int))
  maps <- mapNodeIdNgrams
      <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  terms2id <- insertNgrams $ Map.keys maps
  -- to be removed
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps

  -- new
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ Map.toList maps
  -- insertDocNgrams
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
                                         <*> Just (fromIntegral w :: Double)
                       | (terms, mapNgramsTypes)        <- Map.toList maps
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]

  _     <- Doc.add masterCorpusId ids'
  _cooc <- mkNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams

  pure ids'


-- | For the Unsupervised strategy, build the model (tries) from the documents
-- themselves when none is provided; other strategies are returned unchanged.
withLang :: HasText a
         => GTT.TermType Lang
         -> [DocumentWithId a]
         -> GTT.TermType Lang
withLang (GTT.Unsupervised l n s m) ns = GTT.Unsupervised l n s m'
  where
    m' = case m of
      Nothing -> trace ("buildTries here" :: String)
               $ Just
               $ buildTries n ( fmap toToken
                              $ GTT.uniText
                              $ Text.intercalate " . "
                              $ List.concat
                              $ map hasText ns
                              )
      just_m  -> just_m
withLang l _ = l


------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (HashId, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.viewUniqId': document without uniqId"


toInserted :: [ReturnId]
           -> Map HashId ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted

mergeData :: Map HashId ReturnId
          -> Map HashId a
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      DocumentWithId <$> fmap reId (lookup sha rs)
                     <*> Just hpd

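-- In other words (sketch): 'toInserted' keeps only the rows the database
-- actually inserted and keys them by their hash, and 'mergeData' joins the
-- returned ids back onto the original documents through that same hash,
-- silently dropping documents the database did not report as inserted.
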
------------------------------------------------------------------------

instance HasText HyperdataContact
  where
    hasText = undefined

instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: GTT.TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList [(a', Map.singleton Authors 1) | a' <- authors]

instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hyperdataDocument_title    h
                          , _hyperdataDocument_abstract h
                          ]

instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: GTT.TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: GTT.TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hyperdataDocument_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . splitOn ", ")
                         $ _hyperdataDocument_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hyperdataDocument_authors doc

          terms' <- map text2ngrams
                <$> map (intercalate " " . _terms_label)
                <$> concat
                <$> liftBase (GTT.extractTerms lang' $ hasText doc)

          pure $ Map.fromList $  [(source, Map.singleton Sources 1)]
                              <> [(i', Map.singleton Institutes  1) | i' <- institutes]
                              <> [(a', Map.singleton Authors     1) | a' <- authors   ]
                              <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'    ]

filterNgramsT :: Int
              -> Map Ngrams (Map NgramsType Int)
              -> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map (filter' s) $ Map.toList ms
  where
    filter' s' (ng@(Ngrams t n), y) =
      if Text.length t < s'
         then (ng, y)
         else (Ngrams (Text.take s' t) n, y)

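-- For instance (sketch): with a size of 255, an Ngrams whose text is longer
-- than 255 characters is truncated to its first 255 characters, while the
-- rest of the entry is left untouched.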

documentIdWithNgrams :: HasNodeError err
                     => (a -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e