]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[FIX] Limit Size
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
23 import Data.Set (Set)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.Orchestrator.Types
27 import Gargantext.API.Ngrams (setListNgrams)
28 import Gargantext.API.Ngrams.List.Types
29 import Gargantext.API.Ngrams.Prelude (getNgramsList)
30 import Gargantext.API.Ngrams.Tools (getTermsWith)
31 import Gargantext.API.Ngrams.Types
32 import Gargantext.API.Prelude (GargServer)
33 import Gargantext.API.Types
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Admin.Types.Hyperdata.Document
41 import Gargantext.Database.Admin.Types.Node
42 import Gargantext.Database.Query.Table.Node (getNode)
43 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
44 import Gargantext.Database.Schema.Context
45 import Gargantext.Database.Schema.Ngrams
46 import Gargantext.Database.Schema.Node (_node_parent_id)
47 import Gargantext.Database.Types (Indexed(..))
48 import Gargantext.Prelude
49 import Servant
50 import Servant.Job.Async
51 import qualified Data.ByteString.Lazy as BSL
52 import qualified Data.Csv as Csv
53 import qualified Data.HashMap.Strict as HashMap
54 import qualified Data.List as List
55 import qualified Data.Map as Map
56 import qualified Data.Set as Set
57 import qualified Data.Text as Text
58 import qualified Data.Vector as Vec
59 import qualified Prelude
60 import qualified Protolude as P
61 ------------------------------------------------------------------------
62 type GETAPI = Summary "Get List"
63 :> "lists"
64 :> Capture "listId" ListId
65 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
66 getApi :: GargServer GETAPI
67 getApi = get
68
69 ----------------------
70 type JSONAPI = Summary "Update List"
71 :> "lists"
72 :> Capture "listId" ListId
73 :> "add"
74 :> "form"
75 :> "async"
76 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
77
78 jsonApi :: GargServer JSONAPI
79 jsonApi = postAsync
80
81 ----------------------
82 type CSVAPI = Summary "Update List (legacy v3 CSV)"
83 :> "lists"
84 :> Capture "listId" ListId
85 :> "csv"
86 :> "add"
87 :> "form"
88 :> "async"
89 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
90
91 csvApi :: GargServer CSVAPI
92 csvApi = csvPostAsync
93
94 ------------------------------------------------------------------------
95 get :: HasNodeStory env err m =>
96 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
97 get lId = do
98 lst <- getNgramsList lId
99 let (NodeId id') = lId
100 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
101 , pack $ show id'
102 , ".json"
103 ]
104 ) lst
105
106 ------------------------------------------------------------------------
107 -- TODO : purge list
108 -- TODO talk
109 setList :: FlowCmdM env err m
110 => ListId
111 -> NgramsList
112 -> m Bool
113 setList l m = do
114 -- TODO check with Version for optim
115 printDebug "New list as file" l
116 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
117 -- TODO reindex
118 pure True
119
120 ------------------------------------------------------------------------
121 -- | Re-index documents of a corpus with new ngrams (called orphans here)
122 reIndexWith :: ( HasNodeStory env err m
123 , FlowCmdM env err m
124 )
125 => CorpusId
126 -> ListId
127 -> NgramsType
128 -> Set ListType
129 -> m ()
130 reIndexWith cId lId nt lts = do
131 -- Getting [NgramsTerm]
132 ts <- List.concat
133 <$> map (\(k,vs) -> k:vs)
134 <$> HashMap.toList
135 <$> getTermsWith identity [lId] nt lts
136
137 -- printDebug "ts" ts
138
139 -- Taking the ngrams with 0 occurrences only (orphans)
140 -- occs <- getOccByNgramsOnlyFast' cId lId nt ts
141
142 -- printDebug "occs" occs
143
144 let orphans = ts {- List.concat
145 $ map (\t -> case HashMap.lookup t occs of
146 Nothing -> [t]
147 Just n -> if n <= 1 then [t] else [ ]
148 ) ts
149 -}
150 -- printDebug "orphans" orphans
151
152 -- Get all documents of the corpus
153 docs <- selectDocNodes cId
154 -- printDebug "docs length" (List.length docs)
155
156 -- Checking Text documents where orphans match
157 -- TODO Tests here
158 let
159 ngramsByDoc = map (HashMap.fromList)
160 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
161 $ map (\doc -> List.zip
162 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
163 $ Text.unlines $ catMaybes
164 [ doc ^. context_hyperdata . hd_title
165 , doc ^. context_hyperdata . hd_abstract
166 ]
167 )
168 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
169 ) docs
170
171 -- printDebug "ngramsByDoc" ngramsByDoc
172
173 -- Saving the indexation in database
174 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
175
176 pure () -- ngramsByDoc
177
178 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
179 toIndexedNgrams m t = Indexed <$> i <*> n
180 where
181 i = HashMap.lookup t m
182 n = Just (text2ngrams t)
183
184 ------------------------------------------------------------------------
185 type PostAPI = Summary "Update List"
186 :> "add"
187 :> "form"
188 :> "async"
189 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
190
191 postAsync :: GargServer JSONAPI
192 postAsync lId =
193 serveJobsAPI $
194 JobFunction (\f log' ->
195 let
196 log'' x = do
197 -- printDebug "postAsync ListId" x
198 liftBase $ log' x
199 in postAsync' lId f log'')
200
201 postAsync' :: FlowCmdM env err m
202 => ListId
203 -> WithFile
204 -> (JobLog -> m ())
205 -> m JobLog
206 postAsync' l (WithFile _ m _) logStatus = do
207
208 logStatus JobLog { _scst_succeeded = Just 0
209 , _scst_failed = Just 0
210 , _scst_remaining = Just 2
211 , _scst_events = Just []
212 }
213 printDebug "New list as file" l
214 _ <- setList l m
215 -- printDebug "Done" r
216
217 logStatus JobLog { _scst_succeeded = Just 1
218 , _scst_failed = Just 0
219 , _scst_remaining = Just 1
220 , _scst_events = Just []
221 }
222
223
224 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
225 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
226 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
227
228 pure JobLog { _scst_succeeded = Just 2
229 , _scst_failed = Just 0
230 , _scst_remaining = Just 0
231 , _scst_events = Just []
232 }
233
234
235 ------------------------------------------------------------------------
236 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
237 :> "csv"
238 :> "add"
239 :> "form"
240 :> "async"
241 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
242
243 readCsvText :: Text -> [(Text, Text, Text)]
244 readCsvText t = case eDec of
245 Left _ -> []
246 Right dec -> Vec.toList dec
247 where
248 lt = BSL.fromStrict $ P.encodeUtf8 t
249 eDec = Csv.decodeWith
250 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
251 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
252
253 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
254 parseCsvData lst = Map.fromList $ conv <$> lst
255 where
256 conv (status, label, forms) =
257 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
258 , _nre_list = case status == "map" of
259 True -> MapTerm
260 False -> case status == "main" of
261 True -> CandidateTerm
262 False -> StopTerm
263 , _nre_root = Nothing
264 , _nre_parent = Nothing
265 , _nre_children = MSet
266 $ Map.fromList
267 $ map (\form -> (NgramsTerm form, ()))
268 $ filter (/= "")
269 $ splitOn "|&|" forms
270 }
271 )
272
273 csvPost :: FlowCmdM env err m
274 => ListId
275 -> Text
276 -> m Bool
277 csvPost l m = do
278 printDebug "[csvPost] l" l
279 -- printDebug "[csvPost] m" m
280 -- status label forms
281 let lst = readCsvText m
282 let p = parseCsvData lst
283 --printDebug "[csvPost] lst" lst
284 printDebug "[csvPost] p" p
285 _ <- setListNgrams l NgramsTerms p
286 printDebug "ReIndexing List" l
287 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
288 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
289 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
290
291 pure True
292
293 ------------------------------------------------------------------------
294 csvPostAsync :: GargServer CSVAPI
295 csvPostAsync lId =
296 serveJobsAPI $
297 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
298 let log'' x = do
299 printDebug "[csvPostAsync] filetype" ft
300 printDebug "[csvPostAsync] name" n
301 liftBase $ log' x
302 csvPostAsync' lId f log''
303
304
305 csvPostAsync' :: FlowCmdM env err m
306 => ListId
307 -> WithTextFile
308 -> (JobLog -> m ())
309 -> m JobLog
310 csvPostAsync' l (WithTextFile _ m _) logStatus = do
311 logStatus JobLog { _scst_succeeded = Just 0
312 , _scst_failed = Just 0
313 , _scst_remaining = Just 1
314 , _scst_events = Just []
315 }
316 _r <- csvPost l m
317
318 pure JobLog { _scst_succeeded = Just 1
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 0
321 , _scst_events = Just []
322 }
323 ------------------------------------------------------------------------