]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[ihaskell] some development towards codebook integration
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Aeson
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList)
23 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Set (Set)
25 import Data.Text (Text, concat, pack, splitOn)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer)
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Admin.Types.Hyperdata.Document
41 import Gargantext.Database.Admin.Types.Node
42 import Gargantext.Database.Query.Table.Node (getNode)
43 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
44 import Gargantext.Database.Schema.Context
45 import Gargantext.Database.Schema.Ngrams
46 import Gargantext.Database.Schema.Node (_node_parent_id)
47 import Gargantext.Database.Types (Indexed(..))
48 import Gargantext.Prelude
49 import Network.HTTP.Media ((//), (/:))
50 import Servant
51 import Servant.Job.Async
52 import qualified Data.ByteString.Lazy as BSL
53 import qualified Data.Csv as Csv
54 import qualified Data.HashMap.Strict as HashMap
55 import qualified Data.List as List
56 import qualified Data.Map as Map
57 import qualified Data.Set as Set
58 import qualified Data.Text as Text
59 import qualified Data.Vector as Vec
60 import qualified Prelude
61 import qualified Protolude as P
62 ------------------------------------------------------------------------
63 type GETAPI = Summary "Get List"
64 :> "lists"
65 :> Capture "listId" ListId
66 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
67 getApi :: GargServer GETAPI
68 getApi = get
69
70 data HTML
71 instance Accept HTML where
72 contentType _ = "text" // "html" /: ("charset", "utf-8")
73 instance ToJSON a => MimeRender HTML a where
74 mimeRender _ = encode
75
76 ----------------------
77 type JSONAPI = Summary "Update List"
78 :> "lists"
79 :> Capture "listId" ListId
80 :> "add"
81 :> "form"
82 :> "async"
83 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
84
85 jsonApi :: GargServer JSONAPI
86 jsonApi = postAsync
87
88 ----------------------
89 type CSVAPI = Summary "Update List (legacy v3 CSV)"
90 :> "lists"
91 :> Capture "listId" ListId
92 :> "csv"
93 :> "add"
94 :> "form"
95 :> "async"
96 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
97
98 csvApi :: GargServer CSVAPI
99 csvApi = csvPostAsync
100
101 ------------------------------------------------------------------------
102 get :: HasNodeStory env err m =>
103 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
104 get lId = do
105 lst <- getNgramsList lId
106 let (NodeId id') = lId
107 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
108 , pack $ show id'
109 , ".json"
110 ]
111 ) lst
112
113 ------------------------------------------------------------------------
114 -- TODO : purge list
115 -- TODO talk
116 setList :: FlowCmdM env err m
117 => ListId
118 -> NgramsList
119 -> m Bool
120 setList l m = do
121 -- TODO check with Version for optim
122 printDebug "New list as file" l
123 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
124 -- TODO reindex
125 pure True
126
127 ------------------------------------------------------------------------
128 -- | Re-index documents of a corpus with new ngrams (called orphans here)
129 reIndexWith :: ( HasNodeStory env err m
130 , FlowCmdM env err m
131 )
132 => CorpusId
133 -> ListId
134 -> NgramsType
135 -> Set ListType
136 -> m ()
137 reIndexWith cId lId nt lts = do
138 -- Getting [NgramsTerm]
139 ts <- List.concat
140 <$> map (\(k,vs) -> k:vs)
141 <$> HashMap.toList
142 <$> getTermsWith identity [lId] nt lts
143
144 -- printDebug "ts" ts
145
146 -- Taking the ngrams with 0 occurrences only (orphans)
147 -- occs <- getOccByNgramsOnlyFast' cId lId nt ts
148
149 -- printDebug "occs" occs
150
151 let orphans = ts {- List.concat
152 $ map (\t -> case HashMap.lookup t occs of
153 Nothing -> [t]
154 Just n -> if n <= 1 then [t] else [ ]
155 ) ts
156 -}
157 -- printDebug "orphans" orphans
158
159 -- Get all documents of the corpus
160 docs <- selectDocNodes cId
161 -- printDebug "docs length" (List.length docs)
162
163 -- Checking Text documents where orphans match
164 -- TODO Tests here
165 let
166 ngramsByDoc = map (HashMap.fromList)
167 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
168 $ map (\doc -> List.zip
169 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
170 $ Text.unlines $ catMaybes
171 [ doc ^. context_hyperdata . hd_title
172 , doc ^. context_hyperdata . hd_abstract
173 ]
174 )
175 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
176 ) docs
177
178 -- printDebug "ngramsByDoc" ngramsByDoc
179
180 -- Saving the indexation in database
181 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
182
183 pure () -- ngramsByDoc
184
185 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
186 toIndexedNgrams m t = Indexed <$> i <*> n
187 where
188 i = HashMap.lookup t m
189 n = Just (text2ngrams t)
190
191 ------------------------------------------------------------------------
192 type PostAPI = Summary "Update List"
193 :> "add"
194 :> "form"
195 :> "async"
196 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
197
198 postAsync :: GargServer JSONAPI
199 postAsync lId =
200 serveJobsAPI $
201 JobFunction (\f log' ->
202 let
203 log'' x = do
204 -- printDebug "postAsync ListId" x
205 liftBase $ log' x
206 in postAsync' lId f log'')
207
208 postAsync' :: FlowCmdM env err m
209 => ListId
210 -> WithFile
211 -> (JobLog -> m ())
212 -> m JobLog
213 postAsync' l (WithFile _ m _) logStatus = do
214
215 logStatus JobLog { _scst_succeeded = Just 0
216 , _scst_failed = Just 0
217 , _scst_remaining = Just 2
218 , _scst_events = Just []
219 }
220 printDebug "New list as file" l
221 _ <- setList l m
222 -- printDebug "Done" r
223
224 logStatus JobLog { _scst_succeeded = Just 1
225 , _scst_failed = Just 0
226 , _scst_remaining = Just 1
227 , _scst_events = Just []
228 }
229
230
231 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
232 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
233 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
234
235 pure JobLog { _scst_succeeded = Just 2
236 , _scst_failed = Just 0
237 , _scst_remaining = Just 0
238 , _scst_events = Just []
239 }
240
241
242 ------------------------------------------------------------------------
243 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
244 :> "csv"
245 :> "add"
246 :> "form"
247 :> "async"
248 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
249
250 readCsvText :: Text -> [(Text, Text, Text)]
251 readCsvText t = case eDec of
252 Left _ -> []
253 Right dec -> Vec.toList dec
254 where
255 lt = BSL.fromStrict $ P.encodeUtf8 t
256 eDec = Csv.decodeWith
257 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
258 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
259
260 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
261 parseCsvData lst = Map.fromList $ conv <$> lst
262 where
263 conv (status, label, forms) =
264 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
265 , _nre_list = case status == "map" of
266 True -> MapTerm
267 False -> case status == "main" of
268 True -> CandidateTerm
269 False -> StopTerm
270 , _nre_root = Nothing
271 , _nre_parent = Nothing
272 , _nre_children = MSet
273 $ Map.fromList
274 $ map (\form -> (NgramsTerm form, ()))
275 $ filter (/= "")
276 $ splitOn "|&|" forms
277 }
278 )
279
280 csvPost :: FlowCmdM env err m
281 => ListId
282 -> Text
283 -> m Bool
284 csvPost l m = do
285 printDebug "[csvPost] l" l
286 -- printDebug "[csvPost] m" m
287 -- status label forms
288 let lst = readCsvText m
289 let p = parseCsvData lst
290 --printDebug "[csvPost] lst" lst
291 printDebug "[csvPost] p" p
292 _ <- setListNgrams l NgramsTerms p
293 printDebug "ReIndexing List" l
294 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
295 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
296 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
297
298 pure True
299
300 ------------------------------------------------------------------------
301 csvPostAsync :: GargServer CSVAPI
302 csvPostAsync lId =
303 serveJobsAPI $
304 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
305 let log'' x = do
306 printDebug "[csvPostAsync] filetype" ft
307 printDebug "[csvPostAsync] name" n
308 liftBase $ log' x
309 csvPostAsync' lId f log''
310
311
312 csvPostAsync' :: FlowCmdM env err m
313 => ListId
314 -> WithTextFile
315 -> (JobLog -> m ())
316 -> m JobLog
317 csvPostAsync' l (WithTextFile _ m _) logStatus = do
318 logStatus JobLog { _scst_succeeded = Just 0
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 1
321 , _scst_events = Just []
322 }
323 _r <- csvPost l m
324
325 pure JobLog { _scst_succeeded = Just 1
326 , _scst_failed = Just 0
327 , _scst_remaining = Just 0
328 , _scst_events = Just []
329 }
330 ------------------------------------------------------------------------