]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
Merge branch 'dev-optim' into dev
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Aeson
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList, fromList)
23 import Data.Maybe (catMaybes)
24 import Data.Set (Set)
25 import Data.Text (Text, concat, pack)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types
31 import Gargantext.API.Ngrams.List.Types
32 import Gargantext.API.Prelude (GargServer)
33 import Gargantext.Core.NodeStory
34 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
35 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
36 import Gargantext.Core.Types.Main (ListType(..))
37 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
38 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
39 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
40 import Gargantext.Database.Admin.Types.Hyperdata.Document
41 import Gargantext.Database.Admin.Types.Node
42 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
43 import Gargantext.Database.Schema.Ngrams
44 import Gargantext.Database.Schema.Node
45 import Gargantext.Database.Types (Indexed(..))
46 import Gargantext.Prelude
47 import Network.HTTP.Media ((//), (/:))
48 import Servant
49 import Servant.Job.Async
50 import qualified Data.ByteString.Lazy as BSL
51 import qualified Data.Csv as Csv
52 import qualified Data.HashMap.Strict as HashMap
53 import qualified Data.List as List
54 import qualified Data.Map as Map
55 import qualified Data.Text as Text
56 import qualified Data.Vector as Vec
57 import qualified Prelude as Prelude
58 import qualified Protolude as P
59 ------------------------------------------------------------------------
60 -- | TODO refactor
61 {-
62 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
63 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
64 :<|> PostAPI
65 :<|> CSVPostAPI
66 api :: ListId -> GargServer API
67 api l = get l :<|> postAsync l :<|> csvPostAsync l
68 -}
69
70 ----------------------
71 type GETAPI = Summary "Get List"
72 :> "lists"
73 :> Capture "listId" ListId
74 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
75 getApi :: GargServer GETAPI
76 getApi = get
77
78 data HTML
79 instance Accept HTML where
80 contentType _ = "text" // "html" /: ("charset", "utf-8")
81 instance ToJSON a => MimeRender HTML a where
82 mimeRender _ = encode
83
84 ----------------------
85 type JSONAPI = Summary "Update List"
86 :> "lists"
87 :> Capture "listId" ListId
88 :> "add"
89 :> "form"
90 :> "async"
91 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
92
93 jsonApi :: GargServer JSONAPI
94 jsonApi = postAsync
95
96 ----------------------
97 type CSVAPI = Summary "Update List (legacy v3 CSV)"
98 :> "lists"
99 :> Capture "listId" ListId
100 :> "csv"
101 :> "add"
102 :> "form"
103 :> "async"
104 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
105
106 csvApi :: GargServer CSVAPI
107 csvApi = csvPostAsync
108
109 ------------------------------------------------------------------------
110 get :: HasNodeStory env err m =>
111 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
112 get lId = do
113 lst <- get' lId
114 let (NodeId id') = lId
115 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
116 , pack $ show id'
117 , ".json"
118 ]
119 ) lst
120
121 get' :: HasNodeStory env err m
122 => ListId -> m NgramsList
123 get' lId = fromList
124 <$> zip ngramsTypes
125 <$> mapM (getNgramsTableMap lId) ngramsTypes
126
127 ------------------------------------------------------------------------
128 -- TODO : purge list
129 -- TODO talk
130 post :: FlowCmdM env err m
131 => ListId
132 -> NgramsList
133 -> m Bool
134 post l m = do
135 -- TODO check with Version for optim
136 printDebug "New list as file" l
137 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
138 -- TODO reindex
139 pure True
140
141 ------------------------------------------------------------------------
142 -- | Re-index documents of a corpus with new ngrams (called orphans here)
143 reIndexWith :: ( HasNodeStory env err m
144 , FlowCmdM env err m
145 )
146 => CorpusId
147 -> ListId
148 -> NgramsType
149 -> Set ListType
150 -> m ()
151 reIndexWith cId lId nt lts = do
152 -- Getting [NgramsTerm]
153 ts <- List.concat
154 <$> map (\(k,vs) -> k:vs)
155 <$> HashMap.toList
156 <$> getTermsWith identity [lId] nt lts
157
158 -- printDebug "ts" ts
159
160 -- Taking the ngrams with 0 occurrences only (orphans)
161 occs <- getOccByNgramsOnlyFast' cId lId nt ts
162
163 -- printDebug "occs" occs
164
165 let orphans = List.concat
166 $ map (\t -> case HashMap.lookup t occs of
167 Nothing -> [t]
168 Just n -> if n <= 1 then [t] else [ ]
169 ) ts
170
171 -- printDebug "orphans" orphans
172
173 -- Get all documents of the corpus
174 docs <- selectDocNodes cId
175 -- printDebug "docs length" (List.length docs)
176
177 -- Checking Text documents where orphans match
178 -- TODO Tests here
179 let
180 ngramsByDoc = map (HashMap.fromList)
181 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
182 $ map (\doc -> List.zip
183 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
184 $ Text.unlines $ catMaybes
185 [ doc ^. node_hyperdata . hd_title
186 , doc ^. node_hyperdata . hd_abstract
187 ]
188 )
189 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
190 ) docs
191
192 -- printDebug "ngramsByDoc" ngramsByDoc
193
194 -- Saving the indexation in database
195 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
196
197 pure () -- ngramsByDoc
198
199 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
200 toIndexedNgrams m t = Indexed <$> i <*> n
201 where
202 i = HashMap.lookup t m
203 n = Just (text2ngrams t)
204
205 ------------------------------------------------------------------------
206 type PostAPI = Summary "Update List"
207 :> "add"
208 :> "form"
209 :> "async"
210 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
211
212 postAsync :: GargServer JSONAPI
213 postAsync lId =
214 serveJobsAPI $
215 JobFunction (\f log' ->
216 let
217 log'' x = do
218 printDebug "postAsync ListId" x
219 liftBase $ log' x
220 in postAsync' lId f log'')
221
222 postAsync' :: FlowCmdM env err m
223 => ListId
224 -> WithFile
225 -> (JobLog -> m ())
226 -> m JobLog
227 postAsync' l (WithFile _ m _) logStatus = do
228
229 logStatus JobLog { _scst_succeeded = Just 0
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 1
232 , _scst_events = Just []
233 }
234 printDebug "New list as file" l
235 _ <- post l m
236 -- printDebug "Done" r
237
238 pure JobLog { _scst_succeeded = Just 1
239 , _scst_failed = Just 0
240 , _scst_remaining = Just 0
241 , _scst_events = Just []
242 }
243 ------------------------------------------------------------------------
244
245 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
246 :> "csv"
247 :> "add"
248 :> "form"
249 :> "async"
250 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
251
252 readCsvText :: Text -> [(Text, Text, Text)]
253 readCsvText t = case eDec of
254 Left _ -> []
255 Right dec -> Vec.toList dec
256 where
257 lt = BSL.fromStrict $ P.encodeUtf8 t
258 eDec = Csv.decodeWith
259 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
260 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
261
262 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
263 parseCsvData lst = Map.fromList $ conv <$> lst
264 where
265 conv (_status, label, _forms) =
266 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
267 , _nre_list = CandidateTerm
268 , _nre_root = Nothing
269 , _nre_parent = Nothing
270 , _nre_children = MSet Map.empty })
271
272 csvPost :: FlowCmdM env err m
273 => ListId
274 -> Text
275 -> m Bool
276 csvPost l m = do
277 printDebug "[csvPost] l" l
278 -- printDebug "[csvPost] m" m
279 -- status label forms
280 let lst = readCsvText m
281 let p = parseCsvData lst
282 --printDebug "[csvPost] lst" lst
283 printDebug "[csvPost] p" p
284 _ <- setListNgrams l NgramsTerms p
285 pure True
286 ------------------------------------------------------------------------
287
288
289
290 csvPostAsync :: GargServer CSVAPI
291 csvPostAsync lId =
292 serveJobsAPI $
293 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
294 let log'' x = do
295 printDebug "[csvPostAsync] filetype" ft
296 printDebug "[csvPostAsync] name" n
297 liftBase $ log' x
298 csvPostAsync' lId f log''
299
300
301 csvPostAsync' :: FlowCmdM env err m
302 => ListId
303 -> WithTextFile
304 -> (JobLog -> m ())
305 -> m JobLog
306 csvPostAsync' l (WithTextFile _ m _) logStatus = do
307 logStatus JobLog { _scst_succeeded = Just 0
308 , _scst_failed = Just 0
309 , _scst_remaining = Just 1
310 , _scst_events = Just []
311 }
312 _r <- csvPost l m
313
314 pure JobLog { _scst_succeeded = Just 1
315 , _scst_failed = Just 0
316 , _scst_remaining = Just 0
317 , _scst_events = Just []
318 }
319 ------------------------------------------------------------------------