]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[FIX] V3 Ngrams List post/update with CSV format
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Aeson
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList)
23 import Data.Maybe (catMaybes)
24 import Data.Set (Set)
25 import Data.Text (Text, concat, pack, splitOn)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer)
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
44 import Gargantext.Database.Schema.Context
45 import Gargantext.Database.Schema.Ngrams
46 import Gargantext.Database.Types (Indexed(..))
47 import Gargantext.Prelude
48 import Network.HTTP.Media ((//), (/:))
49 import Servant
50 import Servant.Job.Async
51 import qualified Data.ByteString.Lazy as BSL
52 import qualified Data.Csv as Csv
53 import qualified Data.HashMap.Strict as HashMap
54 import qualified Data.List as List
55 import qualified Data.Map as Map
56 import qualified Data.Text as Text
57 import qualified Data.Vector as Vec
58 import qualified Prelude as Prelude
59 import qualified Protolude as P
60 ------------------------------------------------------------------------
61 type GETAPI = Summary "Get List"
62 :> "lists"
63 :> Capture "listId" ListId
64 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
65 getApi :: GargServer GETAPI
66 getApi = get
67
68 data HTML
69 instance Accept HTML where
70 contentType _ = "text" // "html" /: ("charset", "utf-8")
71 instance ToJSON a => MimeRender HTML a where
72 mimeRender _ = encode
73
74 ----------------------
75 type JSONAPI = Summary "Update List"
76 :> "lists"
77 :> Capture "listId" ListId
78 :> "add"
79 :> "form"
80 :> "async"
81 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
82
83 jsonApi :: GargServer JSONAPI
84 jsonApi = postAsync
85
86 ----------------------
87 type CSVAPI = Summary "Update List (legacy v3 CSV)"
88 :> "lists"
89 :> Capture "listId" ListId
90 :> "csv"
91 :> "add"
92 :> "form"
93 :> "async"
94 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
95
96 csvApi :: GargServer CSVAPI
97 csvApi = csvPostAsync
98
99 ------------------------------------------------------------------------
100 get :: HasNodeStory env err m =>
101 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
102 get lId = do
103 lst <- getNgramsList lId
104 let (NodeId id') = lId
105 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
106 , pack $ show id'
107 , ".json"
108 ]
109 ) lst
110
111 ------------------------------------------------------------------------
112 -- TODO : purge list
113 -- TODO talk
114 setList :: FlowCmdM env err m
115 => ListId
116 -> NgramsList
117 -> m Bool
118 setList l m = do
119 -- TODO check with Version for optim
120 printDebug "New list as file" l
121 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
122 -- TODO reindex
123 pure True
124
125 ------------------------------------------------------------------------
126 -- | Re-index documents of a corpus with new ngrams (called orphans here)
127 reIndexWith :: ( HasNodeStory env err m
128 , FlowCmdM env err m
129 )
130 => CorpusId
131 -> ListId
132 -> NgramsType
133 -> Set ListType
134 -> m ()
135 reIndexWith cId lId nt lts = do
136 -- Getting [NgramsTerm]
137 ts <- List.concat
138 <$> map (\(k,vs) -> k:vs)
139 <$> HashMap.toList
140 <$> getTermsWith identity [lId] nt lts
141
142 printDebug "ts" ts
143
144 -- Taking the ngrams with 0 occurrences only (orphans)
145 occs <- getOccByNgramsOnlyFast' cId lId nt ts
146
147 printDebug "occs" occs
148
149 let orphans = List.concat
150 $ map (\t -> case HashMap.lookup t occs of
151 Nothing -> [t]
152 Just n -> if n <= 1 then [t] else [ ]
153 ) ts
154
155 printDebug "orphans" orphans
156
157 -- Get all documents of the corpus
158 docs <- selectDocNodes cId
159 printDebug "docs length" (List.length docs)
160
161 -- Checking Text documents where orphans match
162 -- TODO Tests here
163 let
164 ngramsByDoc = map (HashMap.fromList)
165 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
166 $ map (\doc -> List.zip
167 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
168 $ Text.unlines $ catMaybes
169 [ doc ^. context_hyperdata . hd_title
170 , doc ^. context_hyperdata . hd_abstract
171 ]
172 )
173 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
174 ) docs
175
176 printDebug "ngramsByDoc" ngramsByDoc
177
178 -- Saving the indexation in database
179 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
180
181 pure () -- ngramsByDoc
182
183 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
184 toIndexedNgrams m t = Indexed <$> i <*> n
185 where
186 i = HashMap.lookup t m
187 n = Just (text2ngrams t)
188
189 ------------------------------------------------------------------------
190 type PostAPI = Summary "Update List"
191 :> "add"
192 :> "form"
193 :> "async"
194 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
195
196 postAsync :: GargServer JSONAPI
197 postAsync lId =
198 serveJobsAPI $
199 JobFunction (\f log' ->
200 let
201 log'' x = do
202 printDebug "postAsync ListId" x
203 liftBase $ log' x
204 in postAsync' lId f log'')
205
206 postAsync' :: FlowCmdM env err m
207 => ListId
208 -> WithFile
209 -> (JobLog -> m ())
210 -> m JobLog
211 postAsync' l (WithFile _ m _) logStatus = do
212
213 logStatus JobLog { _scst_succeeded = Just 0
214 , _scst_failed = Just 0
215 , _scst_remaining = Just 1
216 , _scst_events = Just []
217 }
218 printDebug "New list as file" l
219 _ <- setList l m
220 -- printDebug "Done" r
221
222 pure JobLog { _scst_succeeded = Just 1
223 , _scst_failed = Just 0
224 , _scst_remaining = Just 0
225 , _scst_events = Just []
226 }
227 ------------------------------------------------------------------------
228 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
229 :> "csv"
230 :> "add"
231 :> "form"
232 :> "async"
233 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
234
235 readCsvText :: Text -> [(Text, Text, Text)]
236 readCsvText t = case eDec of
237 Left _ -> []
238 Right dec -> Vec.toList dec
239 where
240 lt = BSL.fromStrict $ P.encodeUtf8 t
241 eDec = Csv.decodeWith
242 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
243 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
244
245 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
246 parseCsvData lst = Map.fromList $ conv <$> lst
247 where
248 conv (status, label, forms) =
249 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
250 , _nre_list = case status == "map" of
251 True -> MapTerm
252 False -> case status == "main" of
253 True -> CandidateTerm
254 False -> StopTerm
255 , _nre_root = Nothing
256 , _nre_parent = Nothing
257 , _nre_children = MSet
258 $ Map.fromList
259 $ map (\form -> (NgramsTerm form, ()))
260 $ filter (/= "")
261 $ splitOn "|&|" forms
262 }
263 )
264
265 csvPost :: FlowCmdM env err m
266 => ListId
267 -> Text
268 -> m Bool
269 csvPost l m = do
270 printDebug "[csvPost] l" l
271 -- printDebug "[csvPost] m" m
272 -- status label forms
273 let lst = readCsvText m
274 let p = parseCsvData lst
275 --printDebug "[csvPost] lst" lst
276 printDebug "[csvPost] p" p
277 _ <- setListNgrams l NgramsTerms p
278 pure True
279
280 ------------------------------------------------------------------------
281 csvPostAsync :: GargServer CSVAPI
282 csvPostAsync lId =
283 serveJobsAPI $
284 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
285 let log'' x = do
286 printDebug "[csvPostAsync] filetype" ft
287 printDebug "[csvPostAsync] name" n
288 liftBase $ log' x
289 csvPostAsync' lId f log''
290
291
292 csvPostAsync' :: FlowCmdM env err m
293 => ListId
294 -> WithTextFile
295 -> (JobLog -> m ())
296 -> m JobLog
297 csvPostAsync' l (WithTextFile _ m _) logStatus = do
298 logStatus JobLog { _scst_succeeded = Just 0
299 , _scst_failed = Just 0
300 , _scst_remaining = Just 1
301 , _scst_events = Just []
302 }
303 _r <- csvPost l m
304
305 pure JobLog { _scst_succeeded = Just 1
306 , _scst_failed = Just 0
307 , _scst_remaining = Just 0
308 , _scst_events = Just []
309 }
310 ------------------------------------------------------------------------