]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[ngrams] send array of context ids, instead of occurrences int
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
23 import Data.Set (Set)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
51 import Servant
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Prelude
62 import qualified Protolude as P
63 ------------------------------------------------------------------------
64 type GETAPI = Summary "Get List"
65 :> "lists"
66 :> Capture "listId" ListId
67 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
68 getApi :: GargServer GETAPI
69 getApi = get
70
71 ----------------------
72 type JSONAPI = Summary "Update List"
73 :> "lists"
74 :> Capture "listId" ListId
75 :> "add"
76 :> "form"
77 :> "async"
78 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
79
80 jsonApi :: ServerT JSONAPI (GargM Env GargError)
81 jsonApi = postAsync
82
83 ----------------------
84 type CSVAPI = Summary "Update List (legacy v3 CSV)"
85 :> "lists"
86 :> Capture "listId" ListId
87 :> "csv"
88 :> "add"
89 :> "form"
90 :> "async"
91 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
92
93 csvApi :: ServerT CSVAPI (GargM Env GargError)
94 csvApi = csvPostAsync
95
96 ------------------------------------------------------------------------
97 get :: HasNodeStory env err m =>
98 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
99 get lId = do
100 lst <- getNgramsList lId
101 let (NodeId id') = lId
102 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
103 , pack $ show id'
104 , ".json"
105 ]
106 ) lst
107
108 ------------------------------------------------------------------------
109 -- TODO : purge list
110 -- TODO talk
111 setList :: FlowCmdM env err m
112 => ListId
113 -> NgramsList
114 -> m Bool
115 setList l m = do
116 -- TODO check with Version for optim
117 printDebug "New list as file" l
118 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
119 -- TODO reindex
120 pure True
121
122 ------------------------------------------------------------------------
123 -- | Re-index documents of a corpus with new ngrams (called orphans here)
124 reIndexWith :: ( HasNodeStory env err m
125 , FlowCmdM env err m
126 )
127 => CorpusId
128 -> ListId
129 -> NgramsType
130 -> Set ListType
131 -> m ()
132 reIndexWith cId lId nt lts = do
133 -- Getting [NgramsTerm]
134 ts <- List.concat
135 <$> map (\(k,vs) -> k:vs)
136 <$> HashMap.toList
137 <$> getTermsWith identity [lId] nt lts
138
139 -- printDebug "ts" ts
140
141 -- Taking the ngrams with 0 occurrences only (orphans)
142 -- occs <- getOccByNgramsOnlyFast' cId lId nt ts
143
144 -- printDebug "occs" occs
145
146 let orphans = ts {- List.concat
147 $ map (\t -> case HashMap.lookup t occs of
148 Nothing -> [t]
149 Just n -> if n <= 1 then [t] else [ ]
150 ) ts
151 -}
152 -- printDebug "orphans" orphans
153
154 -- Get all documents of the corpus
155 docs <- selectDocNodes cId
156 -- printDebug "docs length" (List.length docs)
157
158 -- Checking Text documents where orphans match
159 -- TODO Tests here
160 let
161 ngramsByDoc = map (HashMap.fromListWith (<>))
162 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
163 $ map (\doc -> List.zip
164 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
165 $ Text.unlines $ catMaybes
166 [ doc ^. context_hyperdata . hd_title
167 , doc ^. context_hyperdata . hd_abstract
168 ]
169 )
170 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
171 ) docs
172
173 -- printDebug "ngramsByDoc" ngramsByDoc
174
175 -- Saving the indexation in database
176 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
177
178 pure () -- ngramsByDoc
179
180 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
181 toIndexedNgrams m t = Indexed <$> i <*> n
182 where
183 i = HashMap.lookup t m
184 n = Just (text2ngrams t)
185
186 ------------------------------------------------------------------------
187 type PostAPI = Summary "Update List"
188 :> "add"
189 :> "form"
190 :> "async"
191 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
192
193 postAsync :: ListId -> ServerT PostAPI (GargM Env GargError)
194 postAsync lId =
195 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
196 let
197 log'' x = do
198 -- printDebug "postAsync ListId" x
199 liftBase $ log' x
200 in postAsync' lId f log''
201
202 postAsync' :: FlowCmdM env err m
203 => ListId
204 -> WithFile
205 -> (JobLog -> m ())
206 -> m JobLog
207 postAsync' l (WithFile _ m _) logStatus = do
208
209 logStatus JobLog { _scst_succeeded = Just 0
210 , _scst_failed = Just 0
211 , _scst_remaining = Just 2
212 , _scst_events = Just []
213 }
214 printDebug "New list as file" l
215 _ <- setList l m
216 -- printDebug "Done" r
217
218 logStatus JobLog { _scst_succeeded = Just 1
219 , _scst_failed = Just 0
220 , _scst_remaining = Just 1
221 , _scst_events = Just []
222 }
223
224
225 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
226 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
227 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
228
229 pure JobLog { _scst_succeeded = Just 2
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
233 }
234
235
236 ------------------------------------------------------------------------
237 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
238 :> "csv"
239 :> "add"
240 :> "form"
241 :> "async"
242 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
243
244 readCsvText :: Text -> [(Text, Text, Text)]
245 readCsvText t = case eDec of
246 Left _ -> []
247 Right dec -> Vec.toList dec
248 where
249 lt = BSL.fromStrict $ P.encodeUtf8 t
250 eDec = Csv.decodeWith
251 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
252 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
253
254 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
255 parseCsvData lst = Map.fromList $ conv <$> lst
256 where
257 conv (status, label, forms) =
258 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
259 , _nre_list = case status == "map" of
260 True -> MapTerm
261 False -> case status == "main" of
262 True -> CandidateTerm
263 False -> StopTerm
264 , _nre_root = Nothing
265 , _nre_parent = Nothing
266 , _nre_children = MSet
267 $ Map.fromList
268 $ map (\form -> (NgramsTerm form, ()))
269 $ filter (\w -> w /= "" && w /= label)
270 $ splitOn "|&|" forms
271 }
272 )
273
274 csvPost :: FlowCmdM env err m
275 => ListId
276 -> Text
277 -> m Bool
278 csvPost l m = do
279 printDebug "[csvPost] l" l
280 -- printDebug "[csvPost] m" m
281 -- status label forms
282 let lst = readCsvText m
283 let p = parseCsvData lst
284 --printDebug "[csvPost] lst" lst
285 printDebug "[csvPost] p" p
286 _ <- setListNgrams l NgramsTerms p
287 printDebug "ReIndexing List" l
288 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
289 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
290 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
291
292 pure True
293
294 ------------------------------------------------------------------------
295 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
296 csvPostAsync lId =
297 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
298 let log'' x = do
299 printDebug "[csvPostAsync] filetype" ft
300 printDebug "[csvPostAsync] name" n
301 liftBase $ log' x
302 csvPostAsync' lId f log''
303
304
305 csvPostAsync' :: FlowCmdM env err m
306 => ListId
307 -> WithTextFile
308 -> (JobLog -> m ())
309 -> m JobLog
310 csvPostAsync' l (WithTextFile _ m _) logStatus = do
311 logStatus JobLog { _scst_succeeded = Just 0
312 , _scst_failed = Just 0
313 , _scst_remaining = Just 1
314 , _scst_events = Just []
315 }
316 _r <- csvPost l m
317
318 pure JobLog { _scst_succeeded = Just 1
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 0
321 , _scst_events = Just []
322 }
323 ------------------------------------------------------------------------