]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[list] fixes to list CSV upload
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map.Strict (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
23 import Data.Set (Set)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Job (jobLogFailTotalWithMessage, jobLogSuccess)
29 import Gargantext.API.Ngrams (setListNgrams)
30 import Gargantext.API.Ngrams.List.Types
31 import Gargantext.API.Ngrams.Prelude (getNgramsList)
32 import Gargantext.API.Ngrams.Tools (getTermsWith)
33 import Gargantext.API.Ngrams.Types
34 import Gargantext.API.Prelude (GargServer, GargM, GargError)
35 import Gargantext.API.Types
36 import Gargantext.Core.NodeStory
37 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
38 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
39 import Gargantext.Core.Types.Main (ListType(..))
40 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
41 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
42 import Gargantext.Database.Admin.Types.Hyperdata.Document
43 import Gargantext.Database.Admin.Types.Node
44 import Gargantext.Database.Query.Table.Node (getNode)
45 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
46 import Gargantext.Database.Schema.Context
47 import Gargantext.Database.Schema.Ngrams
48 import Gargantext.Database.Schema.Node (_node_parent_id)
49 import Gargantext.Database.Types (Indexed(..))
50 import Gargantext.Prelude
51 import Gargantext.Utils.Jobs (serveJobsAPI, jobHandleLogger)
52 import Servant
53 -- import Servant.Job.Async
54 import qualified Data.ByteString.Lazy as BSL
55 import qualified Data.Csv as Csv
56 import qualified Data.HashMap.Strict as HashMap
57 import qualified Data.List as List
58 import qualified Data.Map.Strict as Map
59 import qualified Data.Set as Set
60 import qualified Data.Text as Text
61 import qualified Data.Vector as Vec
62 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
63 import qualified Gargantext.Utils.Servant as GUS
64 import qualified Prelude
65 import qualified Protolude as P
66 ------------------------------------------------------------------------
67 type GETAPI = Summary "Get List"
68 :> "lists"
69 :> Capture "listId" ListId
70 :> "json"
71 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
72 :<|> "lists"
73 :> Capture "listId" ListId
74 :> "csv"
75 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
76 getApi :: GargServer GETAPI
77 getApi = getJson :<|> getCsv
78
79 ----------------------
80 type JSONAPI = Summary "Update List"
81 :> "lists"
82 :> Capture "listId" ListId
83 :> "add"
84 :> "form"
85 :> "async"
86 :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog
87
88 jsonApi :: ServerT JSONAPI (GargM Env GargError)
89 jsonApi = jsonPostAsync
90
91 ----------------------
92 type CSVAPI = Summary "Update List (legacy v3 CSV)"
93 :> "lists"
94 :> Capture "listId" ListId
95 :> "csv"
96 :> "add"
97 :> "form"
98 :> "async"
99 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
100
101 csvApi :: ServerT CSVAPI (GargM Env GargError)
102 csvApi = csvPostAsync
103
104 ------------------------------------------------------------------------
105 getJson :: HasNodeStory env err m =>
106 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
107 getJson lId = do
108 lst <- getNgramsList lId
109 let (NodeId id') = lId
110 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
111 , pack $ show id'
112 , ".json"
113 ]
114 ) lst
115
116 getCsv :: HasNodeStory env err m =>
117 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
118 getCsv lId = do
119 lst <- getNgramsList lId
120 let (NodeId id') = lId
121 return $ case Map.lookup TableNgrams.NgramsTerms lst of
122 Nothing -> noHeader Map.empty
123 Just (Versioned { _v_data }) ->
124 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
125 , pack $ show id'
126 , ".csv"
127 ]
128 ) _v_data
129
130 ------------------------------------------------------------------------
131 -- TODO : purge list
132 -- TODO talk
133 setList :: FlowCmdM env err m
134 => ListId
135 -> NgramsList
136 -> m Bool
137 setList l m = do
138 -- TODO check with Version for optim
139 -- printDebug "New list as file" l
140 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
141 -- TODO reindex
142 pure True
143
144 ------------------------------------------------------------------------
145 -- | Re-index documents of a corpus with new ngrams (called orphans here)
146 reIndexWith :: ( HasNodeStory env err m
147 , FlowCmdM env err m
148 )
149 => CorpusId
150 -> ListId
151 -> NgramsType
152 -> Set ListType
153 -> m ()
154 reIndexWith cId lId nt lts = do
155 -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
156
157 -- Getting [NgramsTerm]
158 ts <- List.concat
159 <$> map (\(k,vs) -> k:vs)
160 <$> HashMap.toList
161 <$> getTermsWith identity [lId] nt lts
162
163 -- Get all documents of the corpus
164 docs <- selectDocNodes cId
165
166 let
167 -- fromListWith (<>)
168 ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
169 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
170 $ map (\doc -> List.zip
171 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
172 $ Text.unlines $ catMaybes
173 [ doc ^. context_hyperdata . hd_title
174 , doc ^. context_hyperdata . hd_abstract
175 ]
176 )
177 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
178 ) docs
179
180 -- printDebug "ngramsByDoc: " ngramsByDoc
181
182 -- Saving the indexation in database
183 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
184
185 pure ()
186
187 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
188 toIndexedNgrams m t = Indexed <$> i <*> n
189 where
190 i = HashMap.lookup t m
191 n = Just (text2ngrams t)
192
193 ------------------------------------------------------------------------
194 jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
195 jsonPostAsync lId =
196 serveJobsAPI UpdateNgramsListJobJSON $ \jHandle f ->
197 let
198 log'' x = do
199 -- printDebug "postAsync ListId" x
200 jobHandleLogger jHandle x
201 in postAsync' lId f log''
202
203 postAsync' :: FlowCmdM env err m
204 => ListId
205 -> WithJsonFile
206 -> (JobLog -> m ())
207 -> m JobLog
208 postAsync' l (WithJsonFile m _) logStatus = do
209
210 logStatus JobLog { _scst_succeeded = Just 0
211 , _scst_failed = Just 0
212 , _scst_remaining = Just 2
213 , _scst_events = Just []
214 }
215 -- printDebug "New list as file" l
216 _ <- setList l m
217 -- printDebug "Done" r
218
219 logStatus JobLog { _scst_succeeded = Just 1
220 , _scst_failed = Just 0
221 , _scst_remaining = Just 1
222 , _scst_events = Just []
223 }
224
225
226 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
227 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
228 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
229
230 pure JobLog { _scst_succeeded = Just 2
231 , _scst_failed = Just 0
232 , _scst_remaining = Just 0
233 , _scst_events = Just []
234 }
235
236
237 ------------------------------------------------------------------------
238
239 readCsvText :: Text -> Either Text [(Text, Text, Text)]
240 readCsvText t = case eDec of
241 Left err -> Left $ pack err
242 Right dec -> Right $ Vec.toList dec
243 where
244 lt = BSL.fromStrict $ P.encodeUtf8 t
245 eDec = Csv.decodeWith
246 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
247 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
248
249 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
250 parseCsvData lst = Map.fromList $ conv <$> lst
251 where
252 conv (status, label, forms) =
253 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
254 , _nre_list = case status == "map" of
255 True -> MapTerm
256 False -> case status == "main" of
257 True -> CandidateTerm
258 False -> StopTerm
259 , _nre_root = Nothing
260 , _nre_parent = Nothing
261 , _nre_children = MSet
262 $ Map.fromList
263 $ map (\form -> (NgramsTerm form, ()))
264 $ filter (\w -> w /= "" && w /= label)
265 $ splitOn "|&|" forms
266 }
267 )
268
269 csvPost :: FlowCmdM env err m
270 => ListId
271 -> Text
272 -> m (Either Text ())
273 csvPost l m = do
274 -- printDebug "[csvPost] l" l
275 -- printDebug "[csvPost] m" m
276 -- status label forms
277 let eLst = readCsvText m
278 case eLst of
279 Left err -> pure $ Left err
280 Right lst -> do
281 let p = parseCsvData lst
282 --printDebug "[csvPost] lst" lst
283 -- printDebug "[csvPost] p" p
284 _ <- setListNgrams l NgramsTerms p
285 -- printDebug "ReIndexing List" l
286 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
287 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
288 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
289
290 pure $ Right ()
291
292 ------------------------------------------------------------------------
293 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
294 csvPostAsync lId =
295 serveJobsAPI UpdateNgramsListJobCSV $ \jHandle f@(WithTextFile _ft _ _n) -> do
296 let log'' x = do
297 -- printDebug "[csvPostAsync] filetype" ft
298 -- printDebug "[csvPostAsync] name" n
299 jobHandleLogger jHandle x
300 jl <- csvPostAsync' lId f log''
301 printDebug "[csvPostAsync] job ended with joblog: " jl
302 log'' jl
303 pure jl
304
305
306 csvPostAsync' :: FlowCmdM env err m
307 => ListId
308 -> WithTextFile
309 -> (JobLog -> m ())
310 -> m JobLog
311 csvPostAsync' l (WithTextFile _ m _) logStatus = do
312 let jl = JobLog { _scst_succeeded = Just 0
313 , _scst_failed = Just 0
314 , _scst_remaining = Just 1
315 , _scst_events = Just []
316 }
317 logStatus jl
318 ePost <- csvPost l m
319 case ePost of
320 Left err -> pure $ jobLogFailTotalWithMessage err jl
321 Right () -> pure $ jobLogSuccess jl
322 ------------------------------------------------------------------------
323
324
325 -- | This is for debugging the CSV parser in the REPL
326 importCsvFile :: FlowCmdM env err m
327 => ListId -> P.FilePath -> m (Either Text ())
328 importCsvFile lId fp = do
329 contents <- liftBase $ P.readFile fp
330 csvPost lId contents