2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
62 import qualified Gargantext.Utils.Servant as GUS
63 import qualified Prelude
64 import qualified Protolude as P
65 ------------------------------------------------------------------------
66 type GETAPI = Summary "Get List"
68 :> Capture "listId" ListId
70 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
72 :> Capture "listId" ListId
74 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
75 getApi :: GargServer GETAPI
76 getApi = getJson :<|> getCsv
78 ----------------------
79 type JSONAPI = Summary "Update List"
81 :> Capture "listId" ListId
85 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
87 jsonApi :: ServerT JSONAPI (GargM Env GargError)
90 ----------------------
91 type CSVAPI = Summary "Update List (legacy v3 CSV)"
93 :> Capture "listId" ListId
98 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
100 csvApi :: ServerT CSVAPI (GargM Env GargError)
101 csvApi = csvPostAsync
103 ------------------------------------------------------------------------
104 getJson :: HasNodeStory env err m =>
105 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
107 lst <- getNgramsList lId
108 let (NodeId id') = lId
109 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
115 getCsv :: HasNodeStory env err m =>
116 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
118 lst <- getNgramsList lId
119 let (NodeId id') = lId
120 return $ case Map.lookup TableNgrams.NgramsTerms lst of
121 Nothing -> noHeader Map.empty
122 Just (Versioned { _v_data }) ->
123 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
129 ------------------------------------------------------------------------
132 setList :: FlowCmdM env err m
137 -- TODO check with Version for optim
138 printDebug "New list as file" l
139 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
143 ------------------------------------------------------------------------
144 -- | Re-index documents of a corpus with new ngrams (called orphans here)
145 reIndexWith :: ( HasNodeStory env err m
153 reIndexWith cId lId nt lts = do
154 printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
156 -- Getting [NgramsTerm]
158 <$> map (\(k,vs) -> k:vs)
160 <$> getTermsWith identity [lId] nt lts
163 let orphans = ts {- List.concat
164 $ map (\t -> case HashMap.lookup t occs of
166 Just n -> if n <= 1 then [t] else [ ]
170 printDebug "orphans" orphans
172 -- Get all documents of the corpus
173 docs <- selectDocNodes cId
174 -- printDebug "docs length" (List.length docs)
176 -- Checking Text documents where orphans match
180 ngramsByDoc = map (HashMap.fromList)
181 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
182 $ map (\doc -> List.zip
183 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
184 $ Text.unlines $ catMaybes
185 [ doc ^. context_hyperdata . hd_title
186 , doc ^. context_hyperdata . hd_abstract
189 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
192 printDebug "ngramsByDoc: " ngramsByDoc
194 -- Saving the indexation in database
195 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
199 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
200 toIndexedNgrams m t = Indexed <$> i <*> n
202 i = HashMap.lookup t m
203 n = Just (text2ngrams t)
205 ------------------------------------------------------------------------
206 type PostAPI = Summary "Update List"
210 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
212 postAsync :: ListId -> ServerT PostAPI (GargM Env GargError)
214 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
217 -- printDebug "postAsync ListId" x
219 in postAsync' lId f log''
221 postAsync' :: FlowCmdM env err m
226 postAsync' l (WithFile _ m _) logStatus = do
228 logStatus JobLog { _scst_succeeded = Just 0
229 , _scst_failed = Just 0
230 , _scst_remaining = Just 2
231 , _scst_events = Just []
233 printDebug "New list as file" l
235 -- printDebug "Done" r
237 logStatus JobLog { _scst_succeeded = Just 1
238 , _scst_failed = Just 0
239 , _scst_remaining = Just 1
240 , _scst_events = Just []
244 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
245 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
246 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
248 pure JobLog { _scst_succeeded = Just 2
249 , _scst_failed = Just 0
250 , _scst_remaining = Just 0
251 , _scst_events = Just []
255 ------------------------------------------------------------------------
256 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
261 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
263 readCsvText :: Text -> [(Text, Text, Text)]
264 readCsvText t = case eDec of
266 Right dec -> Vec.toList dec
268 lt = BSL.fromStrict $ P.encodeUtf8 t
269 eDec = Csv.decodeWith
270 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
271 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
273 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
274 parseCsvData lst = Map.fromList $ conv <$> lst
276 conv (status, label, forms) =
277 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
278 , _nre_list = case status == "map" of
280 False -> case status == "main" of
281 True -> CandidateTerm
283 , _nre_root = Nothing
284 , _nre_parent = Nothing
285 , _nre_children = MSet
287 $ map (\form -> (NgramsTerm form, ()))
288 $ filter (\w -> w /= "" && w /= label)
289 $ splitOn "|&|" forms
293 csvPost :: FlowCmdM env err m
298 printDebug "[csvPost] l" l
299 -- printDebug "[csvPost] m" m
300 -- status label forms
301 let lst = readCsvText m
302 let p = parseCsvData lst
303 --printDebug "[csvPost] lst" lst
304 printDebug "[csvPost] p" p
305 _ <- setListNgrams l NgramsTerms p
306 printDebug "ReIndexing List" l
307 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
308 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
309 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
313 ------------------------------------------------------------------------
314 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
316 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
318 printDebug "[csvPostAsync] filetype" ft
319 printDebug "[csvPostAsync] name" n
321 csvPostAsync' lId f log''
324 csvPostAsync' :: FlowCmdM env err m
329 csvPostAsync' l (WithTextFile _ m _) logStatus = do
330 logStatus JobLog { _scst_succeeded = Just 0
331 , _scst_failed = Just 0
332 , _scst_remaining = Just 1
333 , _scst_events = Just []
337 pure JobLog { _scst_succeeded = Just 1
338 , _scst_failed = Just 0
339 , _scst_remaining = Just 0
340 , _scst_events = Just []
342 ------------------------------------------------------------------------