2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map.Strict (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Job (jobLogFailTotalWithMessage, jobLogSuccess)
29 import Gargantext.API.Ngrams (setListNgrams)
30 import Gargantext.API.Ngrams.List.Types
31 import Gargantext.API.Ngrams.Prelude (getNgramsList)
32 import Gargantext.API.Ngrams.Tools (getTermsWith)
33 import Gargantext.API.Ngrams.Types
34 import Gargantext.API.Prelude (GargServer, GargM, GargError)
35 import Gargantext.API.Types
36 import Gargantext.Core.NodeStory
37 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
38 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
39 import Gargantext.Core.Types.Main (ListType(..))
40 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
41 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
42 import Gargantext.Database.Admin.Types.Hyperdata.Document
43 import Gargantext.Database.Admin.Types.Node
44 import Gargantext.Database.Query.Table.Node (getNode)
45 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
46 import Gargantext.Database.Schema.Context
47 import Gargantext.Database.Schema.Ngrams
48 import Gargantext.Database.Schema.Node (_node_parent_id)
49 import Gargantext.Database.Types (Indexed(..))
50 import Gargantext.Prelude
51 import Gargantext.Utils.Jobs (serveJobsAPI, jobHandleLogger)
53 -- import Servant.Job.Async
54 import qualified Data.ByteString.Lazy as BSL
55 import qualified Data.Csv as Csv
56 import qualified Data.HashMap.Strict as HashMap
57 import qualified Data.List as List
58 import qualified Data.Map.Strict as Map
59 import qualified Data.Set as Set
60 import qualified Data.Text as Text
61 import qualified Data.Vector as Vec
62 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
63 import qualified Gargantext.Utils.Servant as GUS
64 import qualified Prelude
65 import qualified Protolude as P
66 ------------------------------------------------------------------------
67 type GETAPI = Summary "Get List"
69 :> Capture "listId" ListId
71 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
73 :> Capture "listId" ListId
75 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
76 getApi :: GargServer GETAPI
77 getApi = getJson :<|> getCsv
79 ----------------------
80 type JSONAPI = Summary "Update List"
82 :> Capture "listId" ListId
86 :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog
88 jsonApi :: ServerT JSONAPI (GargM Env GargError)
89 jsonApi = jsonPostAsync
91 ----------------------
92 type CSVAPI = Summary "Update List (legacy v3 CSV)"
94 :> Capture "listId" ListId
99 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
101 csvApi :: ServerT CSVAPI (GargM Env GargError)
102 csvApi = csvPostAsync
104 ------------------------------------------------------------------------
105 getJson :: HasNodeStory env err m =>
106 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
108 lst <- getNgramsList lId
109 let (NodeId id') = lId
110 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
116 getCsv :: HasNodeStory env err m =>
117 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
119 lst <- getNgramsList lId
120 let (NodeId id') = lId
121 return $ case Map.lookup TableNgrams.NgramsTerms lst of
122 Nothing -> noHeader Map.empty
123 Just (Versioned { _v_data }) ->
124 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
130 ------------------------------------------------------------------------
133 setList :: FlowCmdM env err m
138 -- TODO check with Version for optim
139 -- printDebug "New list as file" l
140 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
144 ------------------------------------------------------------------------
145 -- | Re-index documents of a corpus with new ngrams (called orphans here)
146 reIndexWith :: ( HasNodeStory env err m
154 reIndexWith cId lId nt lts = do
155 -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
157 -- Getting [NgramsTerm]
159 <$> map (\(k,vs) -> k:vs)
161 <$> getTermsWith identity [lId] nt lts
163 -- Get all documents of the corpus
164 docs <- selectDocNodes cId
168 ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
169 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
170 $ map (\doc -> List.zip
171 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
172 $ Text.unlines $ catMaybes
173 [ doc ^. context_hyperdata . hd_title
174 , doc ^. context_hyperdata . hd_abstract
177 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
180 -- printDebug "ngramsByDoc: " ngramsByDoc
182 -- Saving the indexation in database
183 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
187 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
188 toIndexedNgrams m t = Indexed <$> i <*> n
190 i = HashMap.lookup t m
191 n = Just (text2ngrams t)
193 ------------------------------------------------------------------------
194 jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
196 serveJobsAPI UpdateNgramsListJobJSON $ \jHandle f ->
199 -- printDebug "postAsync ListId" x
200 jobHandleLogger jHandle x
201 in postAsync' lId f log''
203 postAsync' :: FlowCmdM env err m
208 postAsync' l (WithJsonFile m _) logStatus = do
210 logStatus JobLog { _scst_succeeded = Just 0
211 , _scst_failed = Just 0
212 , _scst_remaining = Just 2
213 , _scst_events = Just []
215 -- printDebug "New list as file" l
217 -- printDebug "Done" r
219 logStatus JobLog { _scst_succeeded = Just 1
220 , _scst_failed = Just 0
221 , _scst_remaining = Just 1
222 , _scst_events = Just []
226 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
227 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
228 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
230 pure JobLog { _scst_succeeded = Just 2
231 , _scst_failed = Just 0
232 , _scst_remaining = Just 0
233 , _scst_events = Just []
237 ------------------------------------------------------------------------
239 readCsvText :: Text -> Either Text [(Text, Text, Text)]
240 readCsvText t = case eDec of
241 Left err -> Left $ pack err
242 Right dec -> Right $ Vec.toList dec
244 lt = BSL.fromStrict $ P.encodeUtf8 t
245 eDec = Csv.decodeWith
246 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
247 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
249 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
250 parseCsvData lst = Map.fromList $ conv <$> lst
252 conv (status, label, forms) =
253 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
254 , _nre_list = case status == "map" of
256 False -> case status == "main" of
257 True -> CandidateTerm
259 , _nre_root = Nothing
260 , _nre_parent = Nothing
261 , _nre_children = MSet
263 $ map (\form -> (NgramsTerm form, ()))
264 $ filter (\w -> w /= "" && w /= label)
265 $ splitOn "|&|" forms
269 csvPost :: FlowCmdM env err m
272 -> m (Either Text ())
274 -- printDebug "[csvPost] l" l
275 -- printDebug "[csvPost] m" m
276 -- status label forms
277 let eLst = readCsvText m
279 Left err -> pure $ Left err
281 let p = parseCsvData lst
282 --printDebug "[csvPost] lst" lst
283 -- printDebug "[csvPost] p" p
284 _ <- setListNgrams l NgramsTerms p
285 -- printDebug "ReIndexing List" l
286 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
287 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
288 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
292 ------------------------------------------------------------------------
293 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
295 serveJobsAPI UpdateNgramsListJobCSV $ \jHandle f@(WithTextFile _ft _ _n) -> do
297 -- printDebug "[csvPostAsync] filetype" ft
298 -- printDebug "[csvPostAsync] name" n
299 jobHandleLogger jHandle x
300 jl <- csvPostAsync' lId f log''
301 printDebug "[csvPostAsync] job ended with joblog: " jl
306 csvPostAsync' :: FlowCmdM env err m
311 csvPostAsync' l (WithTextFile _ m _) logStatus = do
312 let jl = JobLog { _scst_succeeded = Just 0
313 , _scst_failed = Just 0
314 , _scst_remaining = Just 1
315 , _scst_events = Just []
320 Left err -> pure $ jobLogFailTotalWithMessage err jl
321 Right () -> pure $ jobLogSuccess jl
322 ------------------------------------------------------------------------
325 -- | This is for debugging the CSV parser in the REPL
326 importCsvFile :: FlowCmdM env err m
327 => ListId -> P.FilePath -> m (Either Text ())
328 importCsvFile lId fp = do
329 contents <- liftBase $ P.readFile fp