2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Prelude
62 import qualified Protolude as P
63 ------------------------------------------------------------------------
64 type GETAPI = Summary "Get List"
66 :> Capture "listId" ListId
67 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
68 getApi :: GargServer GETAPI
71 ----------------------
72 type JSONAPI = Summary "Update List"
74 :> Capture "listId" ListId
78 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
80 jsonApi :: ServerT JSONAPI (GargM Env GargError)
83 ----------------------
84 type CSVAPI = Summary "Update List (legacy v3 CSV)"
86 :> Capture "listId" ListId
91 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
93 csvApi :: ServerT CSVAPI (GargM Env GargError)
96 ------------------------------------------------------------------------
97 get :: HasNodeStory env err m =>
98 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
100 lst <- getNgramsList lId
101 let (NodeId id') = lId
102 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
108 ------------------------------------------------------------------------
111 setList :: FlowCmdM env err m
116 -- TODO check with Version for optim
117 printDebug "New list as file" l
118 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
122 ------------------------------------------------------------------------
123 -- | Re-index documents of a corpus with new ngrams (called orphans here)
124 reIndexWith :: ( HasNodeStory env err m
132 reIndexWith cId lId nt lts = do
133 printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
135 -- Getting [NgramsTerm]
137 <$> map (\(k,vs) -> k:vs)
139 <$> getTermsWith identity [lId] nt lts
142 let orphans = ts {- List.concat
143 $ map (\t -> case HashMap.lookup t occs of
145 Just n -> if n <= 1 then [t] else [ ]
149 printDebug "orphans" orphans
151 -- Get all documents of the corpus
152 docs <- selectDocNodes cId
153 -- printDebug "docs length" (List.length docs)
155 -- Checking Text documents where orphans match
159 ngramsByDoc = map (HashMap.fromList)
160 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
161 $ map (\doc -> List.zip
162 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
163 $ Text.unlines $ catMaybes
164 [ doc ^. context_hyperdata . hd_title
165 , doc ^. context_hyperdata . hd_abstract
168 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
171 printDebug "ngramsByDoc: " ngramsByDoc
173 -- Saving the indexation in database
174 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
178 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
179 toIndexedNgrams m t = Indexed <$> i <*> n
181 i = HashMap.lookup t m
182 n = Just (text2ngrams t)
184 ------------------------------------------------------------------------
185 type PostAPI = Summary "Update List"
189 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
191 postAsync :: ListId -> ServerT PostAPI (GargM Env GargError)
193 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
196 -- printDebug "postAsync ListId" x
198 in postAsync' lId f log''
200 postAsync' :: FlowCmdM env err m
205 postAsync' l (WithFile _ m _) logStatus = do
207 logStatus JobLog { _scst_succeeded = Just 0
208 , _scst_failed = Just 0
209 , _scst_remaining = Just 2
210 , _scst_events = Just []
212 printDebug "New list as file" l
214 -- printDebug "Done" r
216 logStatus JobLog { _scst_succeeded = Just 1
217 , _scst_failed = Just 0
218 , _scst_remaining = Just 1
219 , _scst_events = Just []
223 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
224 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
225 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
227 pure JobLog { _scst_succeeded = Just 2
228 , _scst_failed = Just 0
229 , _scst_remaining = Just 0
230 , _scst_events = Just []
234 ------------------------------------------------------------------------
235 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
240 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
242 readCsvText :: Text -> [(Text, Text, Text)]
243 readCsvText t = case eDec of
245 Right dec -> Vec.toList dec
247 lt = BSL.fromStrict $ P.encodeUtf8 t
248 eDec = Csv.decodeWith
249 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
250 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
252 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
253 parseCsvData lst = Map.fromList $ conv <$> lst
255 conv (status, label, forms) =
256 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
257 , _nre_list = case status == "map" of
259 False -> case status == "main" of
260 True -> CandidateTerm
262 , _nre_root = Nothing
263 , _nre_parent = Nothing
264 , _nre_children = MSet
266 $ map (\form -> (NgramsTerm form, ()))
267 $ filter (\w -> w /= "" && w /= label)
268 $ splitOn "|&|" forms
272 csvPost :: FlowCmdM env err m
277 printDebug "[csvPost] l" l
278 -- printDebug "[csvPost] m" m
279 -- status label forms
280 let lst = readCsvText m
281 let p = parseCsvData lst
282 --printDebug "[csvPost] lst" lst
283 printDebug "[csvPost] p" p
284 _ <- setListNgrams l NgramsTerms p
285 printDebug "ReIndexing List" l
286 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
287 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
288 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
292 ------------------------------------------------------------------------
293 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
295 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
297 printDebug "[csvPostAsync] filetype" ft
298 printDebug "[csvPostAsync] name" n
300 csvPostAsync' lId f log''
303 csvPostAsync' :: FlowCmdM env err m
308 csvPostAsync' l (WithTextFile _ m _) logStatus = do
309 logStatus JobLog { _scst_succeeded = Just 0
310 , _scst_failed = Just 0
311 , _scst_remaining = Just 1
312 , _scst_events = Just []
316 pure JobLog { _scst_succeeded = Just 1
317 , _scst_failed = Just 0
318 , _scst_remaining = Just 0
319 , _scst_events = Just []
321 ------------------------------------------------------------------------