2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map.Strict (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map.Strict as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
62 import qualified Gargantext.Utils.Servant as GUS
63 import qualified Prelude
64 import qualified Protolude as P
65 ------------------------------------------------------------------------
66 type GETAPI = Summary "Get List"
68 :> Capture "listId" ListId
70 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
72 :> Capture "listId" ListId
74 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
75 getApi :: GargServer GETAPI
76 getApi = getJson :<|> getCsv
78 ----------------------
79 type JSONAPI = Summary "Update List"
81 :> Capture "listId" ListId
85 :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog
87 jsonApi :: ServerT JSONAPI (GargM Env GargError)
88 jsonApi = jsonPostAsync
90 ----------------------
91 type CSVAPI = Summary "Update List (legacy v3 CSV)"
93 :> Capture "listId" ListId
98 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
100 csvApi :: ServerT CSVAPI (GargM Env GargError)
101 csvApi = csvPostAsync
103 ------------------------------------------------------------------------
104 getJson :: HasNodeStory env err m =>
105 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
107 lst <- getNgramsList lId
108 let (NodeId id') = lId
109 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
115 getCsv :: HasNodeStory env err m =>
116 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
118 lst <- getNgramsList lId
119 let (NodeId id') = lId
120 return $ case Map.lookup TableNgrams.NgramsTerms lst of
121 Nothing -> noHeader Map.empty
122 Just (Versioned { _v_data }) ->
123 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
129 ------------------------------------------------------------------------
132 setList :: FlowCmdM env err m
137 -- TODO check with Version for optim
138 -- printDebug "New list as file" l
139 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
143 ------------------------------------------------------------------------
144 -- | Re-index documents of a corpus with new ngrams (called orphans here)
145 reIndexWith :: ( HasNodeStory env err m
153 reIndexWith cId lId nt lts = do
154 -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
156 -- Getting [NgramsTerm]
158 <$> map (\(k,vs) -> k:vs)
160 <$> getTermsWith identity [lId] nt lts
162 -- Get all documents of the corpus
163 docs <- selectDocNodes cId
167 ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
168 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
169 $ map (\doc -> List.zip
170 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
171 $ Text.unlines $ catMaybes
172 [ doc ^. context_hyperdata . hd_title
173 , doc ^. context_hyperdata . hd_abstract
176 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
179 -- printDebug "ngramsByDoc: " ngramsByDoc
181 -- Saving the indexation in database
182 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
186 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
187 toIndexedNgrams m t = Indexed <$> i <*> n
189 i = HashMap.lookup t m
190 n = Just (text2ngrams t)
192 ------------------------------------------------------------------------
193 jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
195 serveJobsAPI UpdateNgramsListJobJSON $ \_jHandle f log' ->
198 -- printDebug "postAsync ListId" x
200 in postAsync' lId f log''
202 postAsync' :: FlowCmdM env err m
207 postAsync' l (WithJsonFile m _) logStatus = do
209 logStatus JobLog { _scst_succeeded = Just 0
210 , _scst_failed = Just 0
211 , _scst_remaining = Just 2
212 , _scst_events = Just []
214 -- printDebug "New list as file" l
216 -- printDebug "Done" r
218 logStatus JobLog { _scst_succeeded = Just 1
219 , _scst_failed = Just 0
220 , _scst_remaining = Just 1
221 , _scst_events = Just []
225 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
226 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
227 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
229 pure JobLog { _scst_succeeded = Just 2
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
236 ------------------------------------------------------------------------
238 readCsvText :: Text -> [(Text, Text, Text)]
239 readCsvText t = case eDec of
241 Right dec -> Vec.toList dec
243 lt = BSL.fromStrict $ P.encodeUtf8 t
244 eDec = Csv.decodeWith
245 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
246 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
248 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
249 parseCsvData lst = Map.fromList $ conv <$> lst
251 conv (status, label, forms) =
252 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
253 , _nre_list = case status == "map" of
255 False -> case status == "main" of
256 True -> CandidateTerm
258 , _nre_root = Nothing
259 , _nre_parent = Nothing
260 , _nre_children = MSet
262 $ map (\form -> (NgramsTerm form, ()))
263 $ filter (\w -> w /= "" && w /= label)
264 $ splitOn "|&|" forms
268 csvPost :: FlowCmdM env err m
273 -- printDebug "[csvPost] l" l
274 -- printDebug "[csvPost] m" m
275 -- status label forms
276 let lst = readCsvText m
277 let p = parseCsvData lst
278 --printDebug "[csvPost] lst" lst
279 -- printDebug "[csvPost] p" p
280 _ <- setListNgrams l NgramsTerms p
281 -- printDebug "ReIndexing List" l
282 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
283 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
284 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
288 ------------------------------------------------------------------------
289 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
291 serveJobsAPI UpdateNgramsListJobCSV $ \_jHandle f@(WithTextFile _ft _ _n) log' -> do
293 -- printDebug "[csvPostAsync] filetype" ft
294 -- printDebug "[csvPostAsync] name" n
296 csvPostAsync' lId f log''
299 csvPostAsync' :: FlowCmdM env err m
304 csvPostAsync' l (WithTextFile _ m _) logStatus = do
305 logStatus JobLog { _scst_succeeded = Just 0
306 , _scst_failed = Just 0
307 , _scst_remaining = Just 1
308 , _scst_events = Just []
312 pure JobLog { _scst_succeeded = Just 1
313 , _scst_failed = Just 0
314 , _scst_remaining = Just 0
315 , _scst_events = Just []
317 ------------------------------------------------------------------------