]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Export.hs
[RENAME] ScraperStatus -> JobLog
[gargantext.git] / src / Gargantext / API / Node / Corpus / Export.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.Export
3 Description : Get Metrics from Storage (Database like)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Main exports of Gargantext:
11 - corpus
12 - document and ngrams
13 - lists
14 -}
15
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE TypeOperators #-}
18
19 module Gargantext.API.Node.Corpus.Export
20 where
21
22 import Data.Aeson.TH (deriveJSON)
23 import Data.Map (Map)
24 import Data.Maybe (fromMaybe)
25 import Data.Set (Set)
26 import Data.Swagger
27 import Data.Text (Text)
28 import GHC.Generics (Generic)
29 import Gargantext.API.Ngrams
30 import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo)
31 import Gargantext.API.Prelude (GargNoServer)
32 import Gargantext.Core.Types --
33 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
34 import Gargantext.Database.Action.Metrics.NgramsByNode (getNgramsByNodeOnlyUser)
35 import Gargantext.Database.Query.Table.Node
36 import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
37 import Gargantext.Database.Admin.Config (userMaster)
38 import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
39 import Gargantext.Database.Admin.Types.Node (Node, HyperdataDocument(..), NodeId, ListId, CorpusId)
40 import Gargantext.Database.Prelude (Cmd)
41 import Gargantext.Database.Schema.Node (_node_id, _node_hyperdata)
42 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
43 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
44 import Gargantext.Prelude
45 import Gargantext.Prelude.Utils (sha)
46 import Servant
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50
51
52 -- Corpus Export
53 data Corpus =
54 Corpus { _c_corpus :: [Document]
55 , _c_hash :: Hash
56 } deriving (Generic)
57
58 -- | Document Export
59 data Document =
60 Document { _d_document :: Node HyperdataDocument
61 , _d_ngrams :: Ngrams
62 , _d_hash :: Hash
63 } deriving (Generic)
64
65 data Ngrams =
66 Ngrams { _ng_ngrams :: [Text]
67 , _ng_hash :: Hash
68 } deriving (Generic)
69
70 type Hash = Text
71 -------
72 instance ToSchema Corpus where
73 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_c_")
74
75 instance ToSchema Document where
76 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_d_")
77
78 instance ToSchema Ngrams where
79 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_ng_")
80
81 -------
82 instance ToParamSchema Corpus where
83 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
84
85 instance ToParamSchema Document where
86 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
87
88 instance ToParamSchema Ngrams where
89 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
90 --------------------------------------------------
91 type API = Summary "Corpus Export"
92 :> "export"
93 :> QueryParam "listId" ListId
94 :> QueryParam "ngramsType" NgramsType
95 :> Get '[JSON] Corpus
96
97 --------------------------------------------------
98 getCorpus :: CorpusId
99 -> Maybe ListId
100 -> Maybe NgramsType
101 -> GargNoServer Corpus
102 getCorpus cId lId nt' = do
103
104 let
105 nt = case nt' of
106 Nothing -> NgramsTerms
107 Just t -> t
108
109 ns <- Map.fromList
110 <$> map (\n -> (_node_id n, n))
111 <$> selectDocNodes cId
112 repo <- getRepo
113 ngs <- getNodeNgrams cId lId nt repo
114 let -- uniqId is hash computed already for each document imported in database
115 r = Map.intersectionWith (\a b -> Document a (Ngrams (Set.toList b) (ng_hash b)) (d_hash a b)
116 ) ns ngs
117 where
118 ng_hash b = sha $ List.foldl (\x y -> x<>y) "" $ List.sort $ Set.toList b
119 d_hash a b = sha $ (fromMaybe "" (_hyperdataDocument_uniqId $ _node_hyperdata a))
120 <> (ng_hash b)
121
122 pure $ Corpus (Map.elems r) (sha $ List.foldl (\a b -> a<>b) ""
123 $ List.map _d_hash $ Map.elems r
124 )
125
126 getNodeNgrams :: HasNodeError err
127 => CorpusId
128 -> Maybe ListId
129 -> NgramsType
130 -> NgramsRepo
131 -> Cmd err (Map NodeId (Set Text))
132 getNodeNgrams cId lId' nt repo = do
133 lId <- case lId' of
134 Nothing -> defaultList cId
135 Just l -> pure l
136
137 lIds <- selectNodesWithUsername NodeList userMaster
138 let ngs = filterListWithRoot GraphTerm $ mapTermListRoot [lId] nt repo
139 r <- getNgramsByNodeOnlyUser cId (lIds <> [lId]) nt (Map.keys ngs)
140 pure r
141
142
143 $(deriveJSON (unPrefix "_c_") ''Corpus)
144 $(deriveJSON (unPrefix "_d_") ''Document)
145 $(deriveJSON (unPrefix "_ng_") ''Ngrams)
146
147
148 -- TODO
149 -- Exports List
150 -- Version number of the list
151
152