]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[CsvDoc] implement Maybe for year/month/date
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE MonoLocalBinds #-}
13 {-# LANGUAGE TemplateHaskell #-}
14 {-# LANGUAGE TypeOperators #-}
15
16 module Gargantext.API.Ngrams.List
17 where
18
19 import Control.Lens hiding (elements, Indexed)
20 import Data.Aeson
21 import qualified Data.ByteString.Lazy as BSL
22 import qualified Data.Csv as Csv
23 import Data.Either (Either(..))
24 import Data.HashMap.Strict (HashMap)
25 import qualified Data.HashMap.Strict as HashMap
26 import qualified Data.List as List
27 import Data.Map (Map, toList, fromList)
28 import qualified Data.Map as Map
29 import Data.Maybe (catMaybes)
30 import Data.Set (Set)
31 import Data.Text (Text, concat, pack)
32 import qualified Data.Text as Text
33 import Data.Vector (Vector)
34 import qualified Data.Vector as Vec
35 import Network.HTTP.Media ((//), (/:))
36 import qualified Prelude as Prelude
37 import Servant
38 import Servant.Job.Async
39
40 import qualified Protolude as P
41
42 import Gargantext.API.Admin.Orchestrator.Types
43 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
44 import Gargantext.API.Ngrams.List.Types
45 import Gargantext.API.Ngrams.Tools (getTermsWith)
46 import Gargantext.API.Ngrams.Types
47 import Gargantext.API.Prelude (GargServer)
48 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
49 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
50 import Gargantext.Core.Types.Main (ListType(..))
51 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
54 import Gargantext.Database.Admin.Types.Hyperdata.Document
55 import Gargantext.Database.Admin.Types.Node
56 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
57 import Gargantext.Database.Schema.Ngrams
58 import Gargantext.Database.Schema.Node
59 import Gargantext.Database.Types (Indexed(..))
60 import Gargantext.Prelude
61
62 ------------------------------------------------------------------------
63 {-
64 -- | TODO refactor
65 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
66 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
67 :<|> PostAPI
68 :<|> CSVPostAPI
69
70 api :: ListId -> GargServer API
71 api l = get l :<|> postAsync l :<|> csvPostAsync l
72 -}
73
74
75 ----------------------
76 type GETAPI = Summary "Get List"
77 :> "lists"
78 :> Capture "listId" ListId
79 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
80 getApi :: GargServer GETAPI
81 getApi = get
82
83 data HTML
84 instance Accept HTML where
85 contentType _ = "text" // "html" /: ("charset", "utf-8")
86 instance ToJSON a => MimeRender HTML a where
87 mimeRender _ = encode
88
89
90 ----------------------
91 type JSONAPI = Summary "Update List"
92 :> "lists"
93 :> Capture "listId" ListId
94 :> "add"
95 :> "form"
96 :> "async"
97 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
98
99 jsonApi :: GargServer JSONAPI
100 jsonApi = postAsync
101
102 ----------------------
103 type CSVAPI = Summary "Update List (legacy v3 CSV)"
104 :> "lists"
105 :> Capture "listId" ListId
106 :> "csv"
107 :> "add"
108 :> "form"
109 :> "async"
110 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
111
112 csvApi :: GargServer CSVAPI
113 csvApi = csvPostAsync
114
115 ----------------------
116
117
118
119 ------------------------------------------------------------------------
120 get :: RepoCmdM env err m =>
121 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
122 get lId = do
123 lst <- get' lId
124 let (NodeId id') = lId
125 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
126 , pack $ show id'
127 , ".json"
128 ]
129 ) lst
130
131 get' :: RepoCmdM env err m
132 => ListId -> m NgramsList
133 get' lId = fromList
134 <$> zip ngramsTypes
135 <$> mapM (getNgramsTableMap lId) ngramsTypes
136
137 ------------------------------------------------------------------------
138 -- TODO : purge list
139 -- TODO talk
140 post :: FlowCmdM env err m
141 => ListId
142 -> NgramsList
143 -> m Bool
144 post l m = do
145 -- TODO check with Version for optim
146 printDebug "New list as file" l
147 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
148 -- TODO reindex
149 pure True
150
151
152 -----------------------------------------------------------------------------
153 -- | Re-index documents of a corpus with new ngrams (called orphans here)
154 reIndexWith :: ( HasRepo env
155 , FlowCmdM env err m
156 )
157 => CorpusId
158 -> ListId
159 -> NgramsType
160 -> Set ListType
161 -> m ()
162 reIndexWith cId lId nt lts = do
163 -- Getting [NgramsTerm]
164 ts <- List.concat
165 <$> map (\(k,vs) -> k:vs)
166 <$> HashMap.toList
167 <$> getTermsWith identity [lId] nt lts
168
169 -- printDebug "ts" ts
170
171 -- Taking the ngrams with 0 occurrences only (orphans)
172 occs <- getOccByNgramsOnlyFast' cId lId nt ts
173
174 -- printDebug "occs" occs
175
176 let orphans = List.concat
177 $ map (\t -> case HashMap.lookup t occs of
178 Nothing -> [t]
179 Just n -> if n <= 1 then [t] else [ ]
180 ) ts
181
182 -- printDebug "orphans" orphans
183
184 -- Get all documents of the corpus
185 docs <- selectDocNodes cId
186 -- printDebug "docs length" (List.length docs)
187
188 -- Checking Text documents where orphans match
189 -- TODO Tests here
190 let
191 ngramsByDoc = map (HashMap.fromList)
192 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
193 $ map (\doc -> List.zip
194 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
195 $ Text.unlines $ catMaybes
196 [ doc ^. node_hyperdata . hd_title
197 , doc ^. node_hyperdata . hd_abstract
198 ]
199 )
200 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
201 ) docs
202
203 -- printDebug "ngramsByDoc" ngramsByDoc
204
205 -- Saving the indexation in database
206 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
207
208 pure () -- ngramsByDoc
209
210 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
211 toIndexedNgrams m t = Indexed <$> i <*> n
212 where
213 i = HashMap.lookup t m
214 n = Just (text2ngrams t)
215
216 ------------------------------------------------------------------------
217 type PostAPI = Summary "Update List"
218 :> "add"
219 :> "form"
220 :> "async"
221 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
222
223 postAsync :: GargServer JSONAPI
224 postAsync lId =
225 serveJobsAPI $
226 JobFunction (\f log' ->
227 let
228 log'' x = do
229 printDebug "postAsync ListId" x
230 liftBase $ log' x
231 in postAsync' lId f log'')
232
233 postAsync' :: FlowCmdM env err m
234 => ListId
235 -> WithFile
236 -> (JobLog -> m ())
237 -> m JobLog
238 postAsync' l (WithFile _ m _) logStatus = do
239
240 logStatus JobLog { _scst_succeeded = Just 0
241 , _scst_failed = Just 0
242 , _scst_remaining = Just 1
243 , _scst_events = Just []
244 }
245 printDebug "New list as file" l
246 _ <- post l m
247 -- printDebug "Done" r
248
249 pure JobLog { _scst_succeeded = Just 1
250 , _scst_failed = Just 0
251 , _scst_remaining = Just 0
252 , _scst_events = Just []
253 }
254 ------------------------------------------------------------------------
255 readCsvText :: Text -> [(Text, Text, Text)]
256 readCsvText t = case eDec of
257 Left _ -> []
258 Right dec -> Vec.toList dec
259 where
260 lt = BSL.fromStrict $ P.encodeUtf8 t
261 eDec = Csv.decodeWith
262 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
263 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
264
265 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
266 parseCsvData lst = Map.fromList $ conv <$> lst
267 where
268 conv (_status, label, _forms) =
269 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
270 , _nre_list = CandidateTerm
271 , _nre_root = Nothing
272 , _nre_parent = Nothing
273 , _nre_children = MSet Map.empty })
274
275 csvPost :: FlowCmdM env err m
276 => ListId
277 -> Text
278 -> m Bool
279 csvPost l m = do
280 printDebug "[csvPost] l" l
281 -- printDebug "[csvPost] m" m
282 -- status label forms
283 let lst = readCsvText m
284 let p = parseCsvData lst
285 --printDebug "[csvPost] lst" lst
286 printDebug "[csvPost] p" p
287 _ <- setListNgrams l NgramsTerms p
288 pure True
289 ------------------------------------------------------------------------
290
291
292
293 csvPostAsync :: GargServer CSVAPI
294 csvPostAsync lId =
295 serveJobsAPI $
296 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
297 let log'' x = do
298 printDebug "[csvPostAsync] filetype" ft
299 printDebug "[csvPostAsync] name" n
300 liftBase $ log' x
301 csvPostAsync' lId f log''
302
303 csvPostAsync' :: FlowCmdM env err m
304 => ListId
305 -> WithTextFile
306 -> (JobLog -> m ())
307 -> m JobLog
308 csvPostAsync' l (WithTextFile _ m _) logStatus = do
309 logStatus JobLog { _scst_succeeded = Just 0
310 , _scst_failed = Just 0
311 , _scst_remaining = Just 1
312 , _scst_events = Just []
313 }
314 _r <- csvPost l m
315
316 pure JobLog { _scst_succeeded = Just 1
317 , _scst_failed = Just 0
318 , _scst_remaining = Just 0
319 , _scst_events = Just []
320 }
321
322 ------------------------------------------------------------------------