]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
Update README.md
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map.Strict (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
23 import Data.Set (Set)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
51 import Servant
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map.Strict as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
62 import qualified Gargantext.Utils.Servant as GUS
63 import qualified Prelude
64 import qualified Protolude as P
65 ------------------------------------------------------------------------
66 type GETAPI = Summary "Get List"
67 :> "lists"
68 :> Capture "listId" ListId
69 :> "json"
70 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
71 :<|> "lists"
72 :> Capture "listId" ListId
73 :> "csv"
74 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
75 getApi :: GargServer GETAPI
76 getApi = getJson :<|> getCsv
77
78 ----------------------
79 type JSONAPI = Summary "Update List"
80 :> "lists"
81 :> Capture "listId" ListId
82 :> "add"
83 :> "form"
84 :> "async"
85 :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog
86
87 jsonApi :: ServerT JSONAPI (GargM Env GargError)
88 jsonApi = jsonPostAsync
89
90 ----------------------
91 type CSVAPI = Summary "Update List (legacy v3 CSV)"
92 :> "lists"
93 :> Capture "listId" ListId
94 :> "csv"
95 :> "add"
96 :> "form"
97 :> "async"
98 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
99
100 csvApi :: ServerT CSVAPI (GargM Env GargError)
101 csvApi = csvPostAsync
102
103 ------------------------------------------------------------------------
104 getJson :: HasNodeStory env err m =>
105 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
106 getJson lId = do
107 lst <- getNgramsList lId
108 let (NodeId id') = lId
109 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
110 , pack $ show id'
111 , ".json"
112 ]
113 ) lst
114
115 getCsv :: HasNodeStory env err m =>
116 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
117 getCsv lId = do
118 lst <- getNgramsList lId
119 let (NodeId id') = lId
120 return $ case Map.lookup TableNgrams.NgramsTerms lst of
121 Nothing -> noHeader Map.empty
122 Just (Versioned { _v_data }) ->
123 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
124 , pack $ show id'
125 , ".csv"
126 ]
127 ) _v_data
128
129 ------------------------------------------------------------------------
130 -- TODO : purge list
131 -- TODO talk
132 setList :: FlowCmdM env err m
133 => ListId
134 -> NgramsList
135 -> m Bool
136 setList l m = do
137 -- TODO check with Version for optim
138 printDebug "New list as file" l
139 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
140 -- TODO reindex
141 pure True
142
143 ------------------------------------------------------------------------
144 -- | Re-index documents of a corpus with new ngrams (called orphans here)
145 reIndexWith :: ( HasNodeStory env err m
146 , FlowCmdM env err m
147 )
148 => CorpusId
149 -> ListId
150 -> NgramsType
151 -> Set ListType
152 -> m ()
153 reIndexWith cId lId nt lts = do
154 printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
155
156 -- Getting [NgramsTerm]
157 ts <- List.concat
158 <$> map (\(k,vs) -> k:vs)
159 <$> HashMap.toList
160 <$> getTermsWith identity [lId] nt lts
161
162
163 let orphans = ts {- List.concat
164 $ map (\t -> case HashMap.lookup t occs of
165 Nothing -> [t]
166 Just n -> if n <= 1 then [t] else [ ]
167 ) ts
168 -}
169
170 printDebug "orphans" orphans
171
172 -- Get all documents of the corpus
173 docs <- selectDocNodes cId
174 -- printDebug "docs length" (List.length docs)
175
176 -- Checking Text documents where orphans match
177 -- TODO Tests here
178 let
179 -- fromListWith (<>)
180 ngramsByDoc = map (HashMap.fromList)
181 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
182 $ map (\doc -> List.zip
183 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
184 $ Text.unlines $ catMaybes
185 [ doc ^. context_hyperdata . hd_title
186 , doc ^. context_hyperdata . hd_abstract
187 ]
188 )
189 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
190 ) docs
191
192 printDebug "ngramsByDoc: " ngramsByDoc
193
194 -- Saving the indexation in database
195 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
196
197 pure ()
198
199 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
200 toIndexedNgrams m t = Indexed <$> i <*> n
201 where
202 i = HashMap.lookup t m
203 n = Just (text2ngrams t)
204
205 ------------------------------------------------------------------------
206 jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
207 jsonPostAsync lId =
208 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
209 let
210 log'' x = do
211 -- printDebug "postAsync ListId" x
212 liftBase $ log' x
213 in postAsync' lId f log''
214
215 postAsync' :: FlowCmdM env err m
216 => ListId
217 -> WithJsonFile
218 -> (JobLog -> m ())
219 -> m JobLog
220 postAsync' l (WithJsonFile m _) logStatus = do
221
222 logStatus JobLog { _scst_succeeded = Just 0
223 , _scst_failed = Just 0
224 , _scst_remaining = Just 2
225 , _scst_events = Just []
226 }
227 printDebug "New list as file" l
228 _ <- setList l m
229 -- printDebug "Done" r
230
231 logStatus JobLog { _scst_succeeded = Just 1
232 , _scst_failed = Just 0
233 , _scst_remaining = Just 1
234 , _scst_events = Just []
235 }
236
237
238 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
239 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
240 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
241
242 pure JobLog { _scst_succeeded = Just 2
243 , _scst_failed = Just 0
244 , _scst_remaining = Just 0
245 , _scst_events = Just []
246 }
247
248
249 ------------------------------------------------------------------------
250
251 readCsvText :: Text -> [(Text, Text, Text)]
252 readCsvText t = case eDec of
253 Left _ -> []
254 Right dec -> Vec.toList dec
255 where
256 lt = BSL.fromStrict $ P.encodeUtf8 t
257 eDec = Csv.decodeWith
258 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
259 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
260
261 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
262 parseCsvData lst = Map.fromList $ conv <$> lst
263 where
264 conv (status, label, forms) =
265 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
266 , _nre_list = case status == "map" of
267 True -> MapTerm
268 False -> case status == "main" of
269 True -> CandidateTerm
270 False -> StopTerm
271 , _nre_root = Nothing
272 , _nre_parent = Nothing
273 , _nre_children = MSet
274 $ Map.fromList
275 $ map (\form -> (NgramsTerm form, ()))
276 $ filter (\w -> w /= "" && w /= label)
277 $ splitOn "|&|" forms
278 }
279 )
280
281 csvPost :: FlowCmdM env err m
282 => ListId
283 -> Text
284 -> m Bool
285 csvPost l m = do
286 printDebug "[csvPost] l" l
287 -- printDebug "[csvPost] m" m
288 -- status label forms
289 let lst = readCsvText m
290 let p = parseCsvData lst
291 --printDebug "[csvPost] lst" lst
292 printDebug "[csvPost] p" p
293 _ <- setListNgrams l NgramsTerms p
294 printDebug "ReIndexing List" l
295 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
296 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
297 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
298
299 pure True
300
301 ------------------------------------------------------------------------
302 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
303 csvPostAsync lId =
304 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
305 let log'' x = do
306 printDebug "[csvPostAsync] filetype" ft
307 printDebug "[csvPostAsync] name" n
308 liftBase $ log' x
309 csvPostAsync' lId f log''
310
311
312 csvPostAsync' :: FlowCmdM env err m
313 => ListId
314 -> WithTextFile
315 -> (JobLog -> m ())
316 -> m JobLog
317 csvPostAsync' l (WithTextFile _ m _) logStatus = do
318 logStatus JobLog { _scst_succeeded = Just 0
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 1
321 , _scst_events = Just []
322 }
323 _r <- csvPost l m
324
325 pure JobLog { _scst_succeeded = Just 1
326 , _scst_failed = Just 0
327 , _scst_remaining = Just 0
328 , _scst_events = Just []
329 }
330 ------------------------------------------------------------------------