]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/Export.hs
[FACTORING] G.Text.Terms.
[gargantext.git] / src / Gargantext / API / Corpus / Export.hs
1 {-|
2 Module : Gargantext.API.Corpus.Export
3 Description : Get Metrics from Storage (Database like)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Main exports of Gargantext:
11 - corpus
12 - document and ngrams
13 - lists
14 -}
15
16 {-# LANGUAGE DataKinds #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE FlexibleContexts #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23 {-# LANGUAGE TypeOperators #-}
24
25 module Gargantext.API.Corpus.Export
26 where
27
28 import Data.Aeson.TH (deriveJSON)
29 import Data.Map (Map)
30 import Data.Maybe (fromMaybe)
31 import Data.Set (Set)
32 import Data.Swagger
33 import Data.Text (Text)
34 import GHC.Generics (Generic)
35 import Gargantext.API.Ngrams
36 import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo)
37 import Gargantext.API.Admin.Types (GargNoServer)
38 import Gargantext.Core.Types --
39 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
40 import Gargantext.Database.Action.Metrics.NgramsByNode (getNgramsByNodeOnlyUser)
41 import Gargantext.Database.Action.Query.Node
42 import Gargantext.Database.Action.Query.Node.Select (selectNodesWithUsername)
43 import Gargantext.Database.Admin.Config (userMaster)
44 import Gargantext.Database.Admin.Types.Errors (HasNodeError)
45 import Gargantext.Database.Admin.Types.Node (Node, HyperdataDocument(..), NodeId, ListId, CorpusId)
46 import Gargantext.Database.Admin.Utils (Cmd)
47 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
48 import Gargantext.Database.Schema.NodeNode (selectDocNodes)
49 import Gargantext.Prelude
50 import Gargantext.Prelude.Utils (sha)
51 import Servant
52 import qualified Data.List as List
53 import qualified Data.Map as Map
54 import qualified Data.Set as Set
55
56
57 -- Corpus Export
58 data Corpus =
59 Corpus { _c_corpus :: [Document]
60 , _c_hash :: Hash
61 } deriving (Generic)
62
63 -- | Document Export
64 data Document =
65 Document { _d_document :: Node HyperdataDocument
66 , _d_ngrams :: Ngrams
67 , _d_hash :: Hash
68 } deriving (Generic)
69
70 data Ngrams =
71 Ngrams { _ng_ngrams :: [Text]
72 , _ng_hash :: Hash
73 } deriving (Generic)
74
75 type Hash = Text
76 -------
77 instance ToSchema Corpus where
78 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_c_")
79
80 instance ToSchema Document where
81 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_d_")
82
83 instance ToSchema Ngrams where
84 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_ng_")
85
86 -------
87 instance ToParamSchema Corpus where
88 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
89
90 instance ToParamSchema Document where
91 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
92
93 instance ToParamSchema Ngrams where
94 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
95 --------------------------------------------------
96 type API = Summary "Corpus Export"
97 :> "export"
98 :> QueryParam "listId" ListId
99 :> QueryParam "ngramsType" NgramsType
100 :> Get '[JSON] Corpus
101
102 --------------------------------------------------
103 getCorpus :: CorpusId
104 -> Maybe ListId
105 -> Maybe NgramsType
106 -> GargNoServer Corpus
107 getCorpus cId lId nt' = do
108
109 let
110 nt = case nt' of
111 Nothing -> NgramsTerms
112 Just t -> t
113
114 ns <- Map.fromList
115 <$> map (\n -> (_node_id n, n))
116 <$> selectDocNodes cId
117 repo <- getRepo
118 ngs <- getNodeNgrams cId lId nt repo
119 let -- uniqId is hash computed already for each document imported in database
120 r = Map.intersectionWith (\a b -> Document a (Ngrams (Set.toList b) (ng_hash b)) (d_hash a b)
121 ) ns ngs
122 where
123 ng_hash b = sha $ List.foldl (\x y -> x<>y) "" $ List.sort $ Set.toList b
124 d_hash a b = sha $ (fromMaybe "" (_hyperdataDocument_uniqId $ _node_hyperdata a))
125 <> (ng_hash b)
126
127 pure $ Corpus (Map.elems r) (sha $ List.foldl (\a b -> a<>b) ""
128 $ List.map _d_hash $ Map.elems r
129 )
130
131 getNodeNgrams :: HasNodeError err
132 => CorpusId
133 -> Maybe ListId
134 -> NgramsType
135 -> NgramsRepo
136 -> Cmd err (Map NodeId (Set Text))
137 getNodeNgrams cId lId' nt repo = do
138 lId <- case lId' of
139 Nothing -> defaultList cId
140 Just l -> pure l
141
142 lIds <- selectNodesWithUsername NodeList userMaster
143 let ngs = filterListWithRoot GraphTerm $ mapTermListRoot [lId] nt repo
144 r <- getNgramsByNodeOnlyUser cId (lIds <> [lId]) nt (Map.keys ngs)
145 pure r
146
147
148 $(deriveJSON (unPrefix "_c_") ''Corpus)
149 $(deriveJSON (unPrefix "_d_") ''Document)
150 $(deriveJSON (unPrefix "_ng_") ''Ngrams)
151
152
153 -- TODO
154 -- Exports List
155 -- Version number of the list
156
157