]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[csv] more work on making the parser accept looser input
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Aeson
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (toList, fromList)
22 import Data.Maybe (catMaybes)
23 import Data.Set (Set)
24 import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
25 import Data.Text (Text, concat, pack)
26 import GHC.Generics (Generic)
27 import Network.HTTP.Media ((//), (/:))
28 import Servant
29 import Servant.Job.Async
30 import Servant.Job.Utils (jsonOptions)
31 import Web.FormUrlEncoded (FromForm)
32 import qualified Data.HashMap.Strict as HashMap
33 import qualified Data.List as List
34 import qualified Data.Map as Map
35 import qualified Data.Text as Text
36
37 import Gargantext.API.Admin.Orchestrator.Types
38 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
39 import Gargantext.API.Ngrams.Tools (getTermsWith)
40 import Gargantext.API.Ngrams.Types
41 import Gargantext.API.Node.Corpus.New.File (FileType(..))
42 import Gargantext.API.Prelude (GargServer)
43 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
44 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
45 import Gargantext.Core.Types.Main (ListType(..))
46 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
47 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
48 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
49 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
50 import Gargantext.Database.Admin.Types.Hyperdata.Document
51 import Gargantext.Database.Admin.Types.Node
52 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
53 import Gargantext.Database.Schema.Ngrams
54 import Gargantext.Database.Schema.Node
55 import Gargantext.Database.Types (Indexed(..))
56 import Gargantext.Prelude
57
58 ------------------------------------------------------------------------
59 -- | TODO refactor
60 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
61 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
62 :<|> PostAPI
63 :<|> CSVPostAPI
64
65 data HTML
66 instance Accept HTML where
67 contentType _ = "text" // "html" /: ("charset", "utf-8")
68 instance ToJSON a => MimeRender HTML a where
69 mimeRender _ = encode
70
71
72 api :: ListId -> GargServer API
73 api l = get l :<|> postAsync l :<|> csvPostAsync l
74
75 ----------------------
76 type GETAPI = Summary "Get List"
77 :> "lists"
78 :> Capture "listId" ListId
79 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
80 getApi :: GargServer GETAPI
81 getApi = get
82
83 ----------------------
84 type JSONAPI = Summary "Update List"
85 :> "lists"
86 :> Capture "listId" ListId
87 :> "add"
88 :> "form"
89 :> "async"
90 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
91
92 jsonApi :: GargServer JSONAPI
93 jsonApi = postAsync
94
95 ----------------------
96 type CSVAPI = Summary "Update List (legacy v3 CSV)"
97 :> "lists"
98 :> Capture "listId" ListId
99 :> "csv"
100 :> "add"
101 :> "form"
102 :> "async"
103 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
104
105 csvApi :: GargServer CSVAPI
106 csvApi = csvPostAsync
107
108 ----------------------
109
110
111
112 ------------------------------------------------------------------------
113 get :: RepoCmdM env err m =>
114 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
115 get lId = do
116 lst <- get' lId
117 let (NodeId id') = lId
118 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
119 , pack $ show id'
120 , ".json"
121 ]
122 ) lst
123
124 get' :: RepoCmdM env err m
125 => ListId -> m NgramsList
126 get' lId = fromList
127 <$> zip ngramsTypes
128 <$> mapM (getNgramsTableMap lId) ngramsTypes
129
130 ------------------------------------------------------------------------
131 -- TODO : purge list
132 -- TODO talk
133 post :: FlowCmdM env err m
134 => ListId
135 -> NgramsList
136 -> m Bool
137 post l m = do
138 -- TODO check with Version for optim
139 printDebug "New list as file" l
140 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
141 -- TODO reindex
142 pure True
143
144 ------------------------------------------------------------------------
145 csvPost :: FlowCmdM env err m
146 => ListId
147 -> NgramsList
148 -> m Bool
149 csvPost l m = do
150 printDebug "[csvPost] l" l
151 printDebug "[csvPost] m" m
152 pure True
153
154 -----------------------------------------------------------------------------
155 -- | Re-index documents of a corpus with new ngrams (called orphans here)
156 reIndexWith :: ( HasRepo env
157 , FlowCmdM env err m
158 )
159 => CorpusId
160 -> ListId
161 -> NgramsType
162 -> Set ListType
163 -> m ()
164 reIndexWith cId lId nt lts = do
165 -- Getting [NgramsTerm]
166 ts <- List.concat
167 <$> map (\(k,vs) -> k:vs)
168 <$> HashMap.toList
169 <$> getTermsWith identity [lId] nt lts
170
171 -- printDebug "ts" ts
172
173 -- Taking the ngrams with 0 occurrences only (orphans)
174 occs <- getOccByNgramsOnlyFast' cId lId nt ts
175
176 -- printDebug "occs" occs
177
178 let orphans = List.concat
179 $ map (\t -> case HashMap.lookup t occs of
180 Nothing -> [t]
181 Just n -> if n <= 1 then [t] else [ ]
182 ) ts
183
184 -- printDebug "orphans" orphans
185
186 -- Get all documents of the corpus
187 docs <- selectDocNodes cId
188 -- printDebug "docs length" (List.length docs)
189
190 -- Checking Text documents where orphans match
191 -- TODO Tests here
192 let
193 ngramsByDoc = map (HashMap.fromList)
194 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
195 $ map (\doc -> List.zip
196 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
197 $ Text.unlines $ catMaybes
198 [ doc ^. node_hyperdata . hd_title
199 , doc ^. node_hyperdata . hd_abstract
200 ]
201 )
202 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
203 ) docs
204
205 -- printDebug "ngramsByDoc" ngramsByDoc
206
207 -- Saving the indexation in database
208 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
209
210 pure () -- ngramsByDoc
211
212 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
213 toIndexedNgrams m t = Indexed <$> i <*> n
214 where
215 i = HashMap.lookup t m
216 n = Just (text2ngrams t)
217
218 ------------------------------------------------------------------------
219 type PostAPI = Summary "Update List"
220 :> "add"
221 :> "form"
222 :> "async"
223 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
224
225 postAsync :: GargServer JSONAPI
226 postAsync lId =
227 serveJobsAPI $
228 JobFunction (\f log' ->
229 let
230 log'' x = do
231 printDebug "postAsync ListId" x
232 liftBase $ log' x
233 in postAsync' lId f log'')
234
235 postAsync' :: FlowCmdM env err m
236 => ListId
237 -> WithFile
238 -> (JobLog -> m ())
239 -> m JobLog
240 postAsync' l (WithFile _ m _) logStatus = do
241
242 logStatus JobLog { _scst_succeeded = Just 0
243 , _scst_failed = Just 0
244 , _scst_remaining = Just 1
245 , _scst_events = Just []
246 }
247 printDebug "New list as file" l
248 _ <- post l m
249 -- printDebug "Done" r
250
251 pure JobLog { _scst_succeeded = Just 1
252 , _scst_failed = Just 0
253 , _scst_remaining = Just 0
254 , _scst_events = Just []
255 }
256 ------------------------------------------------------------------------
257 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
258 :> "csv"
259 :> "add"
260 :> "form"
261 :> "async"
262 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
263
264 csvPostAsync :: GargServer CSVAPI
265 csvPostAsync lId =
266 serveJobsAPI $
267 JobFunction $ \f@(WithFile ft _ n) log' -> do
268 printDebug "[csvPostAsync] filetype" ft
269 printDebug "[csvPostAsync] name" n
270 csvPostAsync' lId f (liftBase . log')
271
272 csvPostAsync' :: FlowCmdM env err m
273 => ListId
274 -> WithFile
275 -> (JobLog -> m ())
276 -> m JobLog
277 csvPostAsync' l (WithFile _ m _) logStatus = do
278 logStatus JobLog { _scst_succeeded = Just 0
279 , _scst_failed = Just 0
280 , _scst_remaining = Just 1
281 , _scst_events = Just []
282 }
283 _r <- csvPost l m
284
285 pure JobLog { _scst_succeeded = Just 1
286 , _scst_failed = Just 0
287 , _scst_remaining = Just 0
288 , _scst_events = Just []
289 }
290 ------------------------------------------------------------------------
291
292 data WithFile = WithFile
293 { _wf_filetype :: !FileType
294 , _wf_data :: !NgramsList
295 , _wf_name :: !Text
296 } deriving (Eq, Show, Generic)
297
298 makeLenses ''WithFile
299 instance FromForm WithFile
300 instance FromJSON WithFile where
301 parseJSON = genericParseJSON $ jsonOptions "_wf_"
302 instance ToJSON WithFile where
303 toJSON = genericToJSON $ jsonOptions "_wf_"
304 instance ToSchema WithFile where
305 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")