]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
Merge remote-tracking branch 'origin/481-dev-node-calc-upload' into dev
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
23 import Data.Set (Set)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
51 import Servant
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
62 import qualified Gargantext.Utils.Servant as GUS
63 import qualified Prelude
64 import qualified Protolude as P
65 ------------------------------------------------------------------------
66 type GETAPI = Summary "Get List"
67 :> "lists"
68 :> Capture "listId" ListId
69 :> "json"
70 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
71 :<|> "lists"
72 :> Capture "listId" ListId
73 :> "csv"
74 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
75 getApi :: GargServer GETAPI
76 getApi = getJson :<|> getCsv
77
78 ----------------------
79 type JSONAPI = Summary "Update List"
80 :> "lists"
81 :> Capture "listId" ListId
82 :> "add"
83 :> "form"
84 :> "async"
85 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
86
87 jsonApi :: ServerT JSONAPI (GargM Env GargError)
88 jsonApi = postAsync
89
90 ----------------------
91 type CSVAPI = Summary "Update List (legacy v3 CSV)"
92 :> "lists"
93 :> Capture "listId" ListId
94 :> "csv"
95 :> "add"
96 :> "form"
97 :> "async"
98 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
99
100 csvApi :: ServerT CSVAPI (GargM Env GargError)
101 csvApi = csvPostAsync
102
103 ------------------------------------------------------------------------
104 getJson :: HasNodeStory env err m =>
105 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
106 getJson lId = do
107 lst <- getNgramsList lId
108 let (NodeId id') = lId
109 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
110 , pack $ show id'
111 , ".json"
112 ]
113 ) lst
114
115 getCsv :: HasNodeStory env err m =>
116 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
117 getCsv lId = do
118 lst <- getNgramsList lId
119 let (NodeId id') = lId
120 return $ case Map.lookup TableNgrams.NgramsTerms lst of
121 Nothing -> noHeader Map.empty
122 Just (Versioned { _v_data }) ->
123 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
124 , pack $ show id'
125 , ".csv"
126 ]
127 ) _v_data
128
129 ------------------------------------------------------------------------
130 -- TODO : purge list
131 -- TODO talk
132 setList :: FlowCmdM env err m
133 => ListId
134 -> NgramsList
135 -> m Bool
136 setList l m = do
137 -- TODO check with Version for optim
138 printDebug "New list as file" l
139 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
140 -- TODO reindex
141 pure True
142
143 ------------------------------------------------------------------------
144 -- | Re-index documents of a corpus with new ngrams (called orphans here)
145 reIndexWith :: ( HasNodeStory env err m
146 , FlowCmdM env err m
147 )
148 => CorpusId
149 -> ListId
150 -> NgramsType
151 -> Set ListType
152 -> m ()
153 reIndexWith cId lId nt lts = do
154 printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
155
156 -- Getting [NgramsTerm]
157 ts <- List.concat
158 <$> map (\(k,vs) -> k:vs)
159 <$> HashMap.toList
160 <$> getTermsWith identity [lId] nt lts
161
162
163 let orphans = ts {- List.concat
164 $ map (\t -> case HashMap.lookup t occs of
165 Nothing -> [t]
166 Just n -> if n <= 1 then [t] else [ ]
167 ) ts
168 -}
169
170 printDebug "orphans" orphans
171
172 -- Get all documents of the corpus
173 docs <- selectDocNodes cId
174 -- printDebug "docs length" (List.length docs)
175
176 -- Checking Text documents where orphans match
177 -- TODO Tests here
178 let
179 -- fromListWith (<>)
180 ngramsByDoc = map (HashMap.fromList)
181 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
182 $ map (\doc -> List.zip
183 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
184 $ Text.unlines $ catMaybes
185 [ doc ^. context_hyperdata . hd_title
186 , doc ^. context_hyperdata . hd_abstract
187 ]
188 )
189 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
190 ) docs
191
192 printDebug "ngramsByDoc: " ngramsByDoc
193
194 -- Saving the indexation in database
195 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
196
197 pure ()
198
199 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
200 toIndexedNgrams m t = Indexed <$> i <*> n
201 where
202 i = HashMap.lookup t m
203 n = Just (text2ngrams t)
204
205 ------------------------------------------------------------------------
206 type PostAPI = Summary "Update List"
207 :> "add"
208 :> "form"
209 :> "async"
210 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
211
212 postAsync :: ListId -> ServerT PostAPI (GargM Env GargError)
213 postAsync lId =
214 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
215 let
216 log'' x = do
217 -- printDebug "postAsync ListId" x
218 liftBase $ log' x
219 in postAsync' lId f log''
220
221 postAsync' :: FlowCmdM env err m
222 => ListId
223 -> WithFile
224 -> (JobLog -> m ())
225 -> m JobLog
226 postAsync' l (WithFile _ m _) logStatus = do
227
228 logStatus JobLog { _scst_succeeded = Just 0
229 , _scst_failed = Just 0
230 , _scst_remaining = Just 2
231 , _scst_events = Just []
232 }
233 printDebug "New list as file" l
234 _ <- setList l m
235 -- printDebug "Done" r
236
237 logStatus JobLog { _scst_succeeded = Just 1
238 , _scst_failed = Just 0
239 , _scst_remaining = Just 1
240 , _scst_events = Just []
241 }
242
243
244 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
245 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
246 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
247
248 pure JobLog { _scst_succeeded = Just 2
249 , _scst_failed = Just 0
250 , _scst_remaining = Just 0
251 , _scst_events = Just []
252 }
253
254
255 ------------------------------------------------------------------------
256 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
257 :> "csv"
258 :> "add"
259 :> "form"
260 :> "async"
261 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
262
263 readCsvText :: Text -> [(Text, Text, Text)]
264 readCsvText t = case eDec of
265 Left _ -> []
266 Right dec -> Vec.toList dec
267 where
268 lt = BSL.fromStrict $ P.encodeUtf8 t
269 eDec = Csv.decodeWith
270 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
271 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
272
273 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
274 parseCsvData lst = Map.fromList $ conv <$> lst
275 where
276 conv (status, label, forms) =
277 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
278 , _nre_list = case status == "map" of
279 True -> MapTerm
280 False -> case status == "main" of
281 True -> CandidateTerm
282 False -> StopTerm
283 , _nre_root = Nothing
284 , _nre_parent = Nothing
285 , _nre_children = MSet
286 $ Map.fromList
287 $ map (\form -> (NgramsTerm form, ()))
288 $ filter (\w -> w /= "" && w /= label)
289 $ splitOn "|&|" forms
290 }
291 )
292
293 csvPost :: FlowCmdM env err m
294 => ListId
295 -> Text
296 -> m Bool
297 csvPost l m = do
298 printDebug "[csvPost] l" l
299 -- printDebug "[csvPost] m" m
300 -- status label forms
301 let lst = readCsvText m
302 let p = parseCsvData lst
303 --printDebug "[csvPost] lst" lst
304 printDebug "[csvPost] p" p
305 _ <- setListNgrams l NgramsTerms p
306 printDebug "ReIndexing List" l
307 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
308 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
309 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
310
311 pure True
312
313 ------------------------------------------------------------------------
314 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
315 csvPostAsync lId =
316 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
317 let log'' x = do
318 printDebug "[csvPostAsync] filetype" ft
319 printDebug "[csvPostAsync] name" n
320 liftBase $ log' x
321 csvPostAsync' lId f log''
322
323
324 csvPostAsync' :: FlowCmdM env err m
325 => ListId
326 -> WithTextFile
327 -> (JobLog -> m ())
328 -> m JobLog
329 csvPostAsync' l (WithTextFile _ m _) logStatus = do
330 logStatus JobLog { _scst_succeeded = Just 0
331 , _scst_failed = Just 0
332 , _scst_remaining = Just 1
333 , _scst_events = Just []
334 }
335 _r <- csvPost l m
336
337 pure JobLog { _scst_succeeded = Just 1
338 , _scst_failed = Just 0
339 , _scst_remaining = Just 0
340 , _scst_events = Just []
341 }
342 ------------------------------------------------------------------------