]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
Pass a JobHandle to the serveJobsAPI continuation
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map.Strict (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
23 import Data.Set (Set)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
51 import Servant
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map.Strict as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
62 import qualified Gargantext.Utils.Servant as GUS
63 import qualified Prelude
64 import qualified Protolude as P
65 ------------------------------------------------------------------------
66 type GETAPI = Summary "Get List"
67 :> "lists"
68 :> Capture "listId" ListId
69 :> "json"
70 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
71 :<|> "lists"
72 :> Capture "listId" ListId
73 :> "csv"
74 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
75 getApi :: GargServer GETAPI
76 getApi = getJson :<|> getCsv
77
78 ----------------------
79 type JSONAPI = Summary "Update List"
80 :> "lists"
81 :> Capture "listId" ListId
82 :> "add"
83 :> "form"
84 :> "async"
85 :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog
86
87 jsonApi :: ServerT JSONAPI (GargM Env GargError)
88 jsonApi = jsonPostAsync
89
90 ----------------------
91 type CSVAPI = Summary "Update List (legacy v3 CSV)"
92 :> "lists"
93 :> Capture "listId" ListId
94 :> "csv"
95 :> "add"
96 :> "form"
97 :> "async"
98 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
99
100 csvApi :: ServerT CSVAPI (GargM Env GargError)
101 csvApi = csvPostAsync
102
103 ------------------------------------------------------------------------
104 getJson :: HasNodeStory env err m =>
105 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
106 getJson lId = do
107 lst <- getNgramsList lId
108 let (NodeId id') = lId
109 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
110 , pack $ show id'
111 , ".json"
112 ]
113 ) lst
114
115 getCsv :: HasNodeStory env err m =>
116 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
117 getCsv lId = do
118 lst <- getNgramsList lId
119 let (NodeId id') = lId
120 return $ case Map.lookup TableNgrams.NgramsTerms lst of
121 Nothing -> noHeader Map.empty
122 Just (Versioned { _v_data }) ->
123 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
124 , pack $ show id'
125 , ".csv"
126 ]
127 ) _v_data
128
129 ------------------------------------------------------------------------
130 -- TODO : purge list
131 -- TODO talk
132 setList :: FlowCmdM env err m
133 => ListId
134 -> NgramsList
135 -> m Bool
136 setList l m = do
137 -- TODO check with Version for optim
138 -- printDebug "New list as file" l
139 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
140 -- TODO reindex
141 pure True
142
143 ------------------------------------------------------------------------
144 -- | Re-index documents of a corpus with new ngrams (called orphans here)
145 reIndexWith :: ( HasNodeStory env err m
146 , FlowCmdM env err m
147 )
148 => CorpusId
149 -> ListId
150 -> NgramsType
151 -> Set ListType
152 -> m ()
153 reIndexWith cId lId nt lts = do
154 -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
155
156 -- Getting [NgramsTerm]
157 ts <- List.concat
158 <$> map (\(k,vs) -> k:vs)
159 <$> HashMap.toList
160 <$> getTermsWith identity [lId] nt lts
161
162 -- Get all documents of the corpus
163 docs <- selectDocNodes cId
164
165 let
166 -- fromListWith (<>)
167 ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
168 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
169 $ map (\doc -> List.zip
170 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
171 $ Text.unlines $ catMaybes
172 [ doc ^. context_hyperdata . hd_title
173 , doc ^. context_hyperdata . hd_abstract
174 ]
175 )
176 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
177 ) docs
178
179 -- printDebug "ngramsByDoc: " ngramsByDoc
180
181 -- Saving the indexation in database
182 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
183
184 pure ()
185
186 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
187 toIndexedNgrams m t = Indexed <$> i <*> n
188 where
189 i = HashMap.lookup t m
190 n = Just (text2ngrams t)
191
192 ------------------------------------------------------------------------
193 jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
194 jsonPostAsync lId =
195 serveJobsAPI UpdateNgramsListJobJSON $ \_jHandle f log' ->
196 let
197 log'' x = do
198 -- printDebug "postAsync ListId" x
199 liftBase $ log' x
200 in postAsync' lId f log''
201
202 postAsync' :: FlowCmdM env err m
203 => ListId
204 -> WithJsonFile
205 -> (JobLog -> m ())
206 -> m JobLog
207 postAsync' l (WithJsonFile m _) logStatus = do
208
209 logStatus JobLog { _scst_succeeded = Just 0
210 , _scst_failed = Just 0
211 , _scst_remaining = Just 2
212 , _scst_events = Just []
213 }
214 -- printDebug "New list as file" l
215 _ <- setList l m
216 -- printDebug "Done" r
217
218 logStatus JobLog { _scst_succeeded = Just 1
219 , _scst_failed = Just 0
220 , _scst_remaining = Just 1
221 , _scst_events = Just []
222 }
223
224
225 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
226 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
227 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
228
229 pure JobLog { _scst_succeeded = Just 2
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
233 }
234
235
236 ------------------------------------------------------------------------
237
238 readCsvText :: Text -> [(Text, Text, Text)]
239 readCsvText t = case eDec of
240 Left _ -> []
241 Right dec -> Vec.toList dec
242 where
243 lt = BSL.fromStrict $ P.encodeUtf8 t
244 eDec = Csv.decodeWith
245 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
246 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
247
248 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
249 parseCsvData lst = Map.fromList $ conv <$> lst
250 where
251 conv (status, label, forms) =
252 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
253 , _nre_list = case status == "map" of
254 True -> MapTerm
255 False -> case status == "main" of
256 True -> CandidateTerm
257 False -> StopTerm
258 , _nre_root = Nothing
259 , _nre_parent = Nothing
260 , _nre_children = MSet
261 $ Map.fromList
262 $ map (\form -> (NgramsTerm form, ()))
263 $ filter (\w -> w /= "" && w /= label)
264 $ splitOn "|&|" forms
265 }
266 )
267
268 csvPost :: FlowCmdM env err m
269 => ListId
270 -> Text
271 -> m Bool
272 csvPost l m = do
273 -- printDebug "[csvPost] l" l
274 -- printDebug "[csvPost] m" m
275 -- status label forms
276 let lst = readCsvText m
277 let p = parseCsvData lst
278 --printDebug "[csvPost] lst" lst
279 -- printDebug "[csvPost] p" p
280 _ <- setListNgrams l NgramsTerms p
281 -- printDebug "ReIndexing List" l
282 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
283 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
284 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
285
286 pure True
287
288 ------------------------------------------------------------------------
289 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
290 csvPostAsync lId =
291 serveJobsAPI UpdateNgramsListJobCSV $ \_jHandle f@(WithTextFile _ft _ _n) log' -> do
292 let log'' x = do
293 -- printDebug "[csvPostAsync] filetype" ft
294 -- printDebug "[csvPostAsync] name" n
295 liftBase $ log' x
296 csvPostAsync' lId f log''
297
298
299 csvPostAsync' :: FlowCmdM env err m
300 => ListId
301 -> WithTextFile
302 -> (JobLog -> m ())
303 -> m JobLog
304 csvPostAsync' l (WithTextFile _ m _) logStatus = do
305 logStatus JobLog { _scst_succeeded = Just 0
306 , _scst_failed = Just 0
307 , _scst_remaining = Just 1
308 , _scst_events = Just []
309 }
310 _r <- csvPost l m
311
312 pure JobLog { _scst_succeeded = Just 1
313 , _scst_failed = Just 0
314 , _scst_remaining = Just 0
315 , _scst_events = Just []
316 }
317 ------------------------------------------------------------------------