]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[VERSION] +1 to 0.0.5.6.4
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Aeson
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList)
23 import Data.Maybe (catMaybes)
24 import Data.Set (Set)
25 import Data.Text (Text, concat, pack)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types
31 import Gargantext.API.Ngrams.Prelude (getNgramsList)
32 import Gargantext.API.Ngrams.List.Types
33 import Gargantext.API.Prelude (GargServer)
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
44 import Gargantext.Database.Schema.Ngrams
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Types (Indexed(..))
47 import Gargantext.Prelude
48 import Network.HTTP.Media ((//), (/:))
49 import Servant
50 import Servant.Job.Async
51 import qualified Data.ByteString.Lazy as BSL
52 import qualified Data.Csv as Csv
53 import qualified Data.HashMap.Strict as HashMap
54 import qualified Data.List as List
55 import qualified Data.Map as Map
56 import qualified Data.Text as Text
57 import qualified Data.Vector as Vec
58 import qualified Prelude as Prelude
59 import qualified Protolude as P
60 ------------------------------------------------------------------------
61 -- | TODO refactor
62 {-
63 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
64 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
65 :<|> PostAPI
66 :<|> CSVPostAPI
67 api :: ListId -> GargServer API
68 api l = get l :<|> postAsync l :<|> csvPostAsync l
69 -}
70
71 ----------------------
72 type GETAPI = Summary "Get List"
73 :> "lists"
74 :> Capture "listId" ListId
75 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
76 getApi :: GargServer GETAPI
77 getApi = get
78
79 data HTML
80 instance Accept HTML where
81 contentType _ = "text" // "html" /: ("charset", "utf-8")
82 instance ToJSON a => MimeRender HTML a where
83 mimeRender _ = encode
84
85 ----------------------
86 type JSONAPI = Summary "Update List"
87 :> "lists"
88 :> Capture "listId" ListId
89 :> "add"
90 :> "form"
91 :> "async"
92 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
93
94 jsonApi :: GargServer JSONAPI
95 jsonApi = postAsync
96
97 ----------------------
98 type CSVAPI = Summary "Update List (legacy v3 CSV)"
99 :> "lists"
100 :> Capture "listId" ListId
101 :> "csv"
102 :> "add"
103 :> "form"
104 :> "async"
105 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
106
107 csvApi :: GargServer CSVAPI
108 csvApi = csvPostAsync
109
110 ------------------------------------------------------------------------
111 get :: HasNodeStory env err m =>
112 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
113 get lId = do
114 lst <- getNgramsList lId
115 let (NodeId id') = lId
116 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
117 , pack $ show id'
118 , ".json"
119 ]
120 ) lst
121
122 ------------------------------------------------------------------------
123 -- TODO : purge list
124 -- TODO talk
125 post :: FlowCmdM env err m
126 => ListId
127 -> NgramsList
128 -> m Bool
129 post l m = do
130 -- TODO check with Version for optim
131 printDebug "New list as file" l
132 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
133 -- TODO reindex
134 pure True
135
136 ------------------------------------------------------------------------
137 -- | Re-index documents of a corpus with new ngrams (called orphans here)
138 reIndexWith :: ( HasNodeStory env err m
139 , FlowCmdM env err m
140 )
141 => CorpusId
142 -> ListId
143 -> NgramsType
144 -> Set ListType
145 -> m ()
146 reIndexWith cId lId nt lts = do
147 -- Getting [NgramsTerm]
148 ts <- List.concat
149 <$> map (\(k,vs) -> k:vs)
150 <$> HashMap.toList
151 <$> getTermsWith identity [lId] nt lts
152
153 printDebug "ts" ts
154
155 -- Taking the ngrams with 0 occurrences only (orphans)
156 occs <- getOccByNgramsOnlyFast' cId lId nt ts
157
158 printDebug "occs" occs
159
160 let orphans = List.concat
161 $ map (\t -> case HashMap.lookup t occs of
162 Nothing -> [t]
163 Just n -> if n <= 1 then [t] else [ ]
164 ) ts
165
166 printDebug "orphans" orphans
167
168 -- Get all documents of the corpus
169 docs <- selectDocNodes cId
170 printDebug "docs length" (List.length docs)
171
172 -- Checking Text documents where orphans match
173 -- TODO Tests here
174 let
175 ngramsByDoc = map (HashMap.fromList)
176 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
177 $ map (\doc -> List.zip
178 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
179 $ Text.unlines $ catMaybes
180 [ doc ^. context_hyperdata . hd_title
181 , doc ^. context_hyperdata . hd_abstract
182 ]
183 )
184 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
185 ) docs
186
187 printDebug "ngramsByDoc" ngramsByDoc
188
189 -- Saving the indexation in database
190 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
191
192 pure () -- ngramsByDoc
193
194 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
195 toIndexedNgrams m t = Indexed <$> i <*> n
196 where
197 i = HashMap.lookup t m
198 n = Just (text2ngrams t)
199
200 ------------------------------------------------------------------------
201 type PostAPI = Summary "Update List"
202 :> "add"
203 :> "form"
204 :> "async"
205 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
206
207 postAsync :: GargServer JSONAPI
208 postAsync lId =
209 serveJobsAPI $
210 JobFunction (\f log' ->
211 let
212 log'' x = do
213 printDebug "postAsync ListId" x
214 liftBase $ log' x
215 in postAsync' lId f log'')
216
217 postAsync' :: FlowCmdM env err m
218 => ListId
219 -> WithFile
220 -> (JobLog -> m ())
221 -> m JobLog
222 postAsync' l (WithFile _ m _) logStatus = do
223
224 logStatus JobLog { _scst_succeeded = Just 0
225 , _scst_failed = Just 0
226 , _scst_remaining = Just 1
227 , _scst_events = Just []
228 }
229 printDebug "New list as file" l
230 _ <- post l m
231 -- printDebug "Done" r
232
233 pure JobLog { _scst_succeeded = Just 1
234 , _scst_failed = Just 0
235 , _scst_remaining = Just 0
236 , _scst_events = Just []
237 }
238 ------------------------------------------------------------------------
239
240 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
241 :> "csv"
242 :> "add"
243 :> "form"
244 :> "async"
245 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
246
247 readCsvText :: Text -> [(Text, Text, Text)]
248 readCsvText t = case eDec of
249 Left _ -> []
250 Right dec -> Vec.toList dec
251 where
252 lt = BSL.fromStrict $ P.encodeUtf8 t
253 eDec = Csv.decodeWith
254 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
255 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
256
257 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
258 parseCsvData lst = Map.fromList $ conv <$> lst
259 where
260 conv (_status, label, _forms) =
261 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
262 , _nre_list = CandidateTerm
263 , _nre_root = Nothing
264 , _nre_parent = Nothing
265 , _nre_children = MSet Map.empty })
266
267 csvPost :: FlowCmdM env err m
268 => ListId
269 -> Text
270 -> m Bool
271 csvPost l m = do
272 printDebug "[csvPost] l" l
273 -- printDebug "[csvPost] m" m
274 -- status label forms
275 let lst = readCsvText m
276 let p = parseCsvData lst
277 --printDebug "[csvPost] lst" lst
278 printDebug "[csvPost] p" p
279 _ <- setListNgrams l NgramsTerms p
280 pure True
281 ------------------------------------------------------------------------
282
283
284
285 csvPostAsync :: GargServer CSVAPI
286 csvPostAsync lId =
287 serveJobsAPI $
288 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
289 let log'' x = do
290 printDebug "[csvPostAsync] filetype" ft
291 printDebug "[csvPostAsync] name" n
292 liftBase $ log' x
293 csvPostAsync' lId f log''
294
295
296 csvPostAsync' :: FlowCmdM env err m
297 => ListId
298 -> WithTextFile
299 -> (JobLog -> m ())
300 -> m JobLog
301 csvPostAsync' l (WithTextFile _ m _) logStatus = do
302 logStatus JobLog { _scst_succeeded = Just 0
303 , _scst_failed = Just 0
304 , _scst_remaining = Just 1
305 , _scst_events = Just []
306 }
307 _r <- csvPost l m
308
309 pure JobLog { _scst_succeeded = Just 1
310 , _scst_failed = Just 0
311 , _scst_remaining = Just 0
312 , _scst_events = Just []
313 }
314 ------------------------------------------------------------------------