2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Prelude
62 import qualified Protolude as P
63 ------------------------------------------------------------------------
64 type GETAPI = Summary "Get List"
66 :> Capture "listId" ListId
67 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
68 getApi :: GargServer GETAPI
71 ----------------------
72 type JSONAPI = Summary "Update List"
74 :> Capture "listId" ListId
78 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
80 jsonApi :: ServerT JSONAPI (GargM Env GargError)
83 ----------------------
84 type CSVAPI = Summary "Update List (legacy v3 CSV)"
86 :> Capture "listId" ListId
91 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
93 csvApi :: ServerT CSVAPI (GargM Env GargError)
96 ------------------------------------------------------------------------
97 get :: HasNodeStory env err m =>
98 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
100 lst <- getNgramsList lId
101 let (NodeId id') = lId
102 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
108 ------------------------------------------------------------------------
111 setList :: FlowCmdM env err m
116 -- TODO check with Version for optim
117 printDebug "New list as file" l
118 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
122 ------------------------------------------------------------------------
123 -- | Re-index documents of a corpus with new ngrams (called orphans here)
124 reIndexWith :: ( HasNodeStory env err m
132 reIndexWith cId lId nt lts = do
133 -- Getting [NgramsTerm]
135 <$> map (\(k,vs) -> k:vs)
137 <$> getTermsWith identity [lId] nt lts
139 -- printDebug "ts" ts
141 -- Taking the ngrams with 0 occurrences only (orphans)
142 -- occs <- getOccByNgramsOnlyFast' cId lId nt ts
144 -- printDebug "occs" occs
146 let orphans = ts {- List.concat
147 $ map (\t -> case HashMap.lookup t occs of
149 Just n -> if n <= 1 then [t] else [ ]
152 -- printDebug "orphans" orphans
154 -- Get all documents of the corpus
155 docs <- selectDocNodes cId
156 -- printDebug "docs length" (List.length docs)
158 -- Checking Text documents where orphans match
161 ngramsByDoc = map (HashMap.fromListWith (<>))
162 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
163 $ map (\doc -> List.zip
164 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
165 $ Text.unlines $ catMaybes
166 [ doc ^. context_hyperdata . hd_title
167 , doc ^. context_hyperdata . hd_abstract
170 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
173 -- printDebug "ngramsByDoc" ngramsByDoc
175 -- Saving the indexation in database
176 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
178 pure () -- ngramsByDoc
180 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
181 toIndexedNgrams m t = Indexed <$> i <*> n
183 i = HashMap.lookup t m
184 n = Just (text2ngrams t)
186 ------------------------------------------------------------------------
187 type PostAPI = Summary "Update List"
191 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
193 postAsync :: ListId -> ServerT PostAPI (GargM Env GargError)
195 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
198 -- printDebug "postAsync ListId" x
200 in postAsync' lId f log''
202 postAsync' :: FlowCmdM env err m
207 postAsync' l (WithFile _ m _) logStatus = do
209 logStatus JobLog { _scst_succeeded = Just 0
210 , _scst_failed = Just 0
211 , _scst_remaining = Just 2
212 , _scst_events = Just []
214 printDebug "New list as file" l
216 -- printDebug "Done" r
218 logStatus JobLog { _scst_succeeded = Just 1
219 , _scst_failed = Just 0
220 , _scst_remaining = Just 1
221 , _scst_events = Just []
225 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
226 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
227 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
229 pure JobLog { _scst_succeeded = Just 2
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
236 ------------------------------------------------------------------------
237 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
242 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
244 readCsvText :: Text -> [(Text, Text, Text)]
245 readCsvText t = case eDec of
247 Right dec -> Vec.toList dec
249 lt = BSL.fromStrict $ P.encodeUtf8 t
250 eDec = Csv.decodeWith
251 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
252 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
254 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
255 parseCsvData lst = Map.fromList $ conv <$> lst
257 conv (status, label, forms) =
258 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
259 , _nre_list = case status == "map" of
261 False -> case status == "main" of
262 True -> CandidateTerm
264 , _nre_root = Nothing
265 , _nre_parent = Nothing
266 , _nre_children = MSet
268 $ map (\form -> (NgramsTerm form, ()))
269 $ filter (\w -> w /= "" && w /= label)
270 $ splitOn "|&|" forms
274 csvPost :: FlowCmdM env err m
279 printDebug "[csvPost] l" l
280 -- printDebug "[csvPost] m" m
281 -- status label forms
282 let lst = readCsvText m
283 let p = parseCsvData lst
284 --printDebug "[csvPost] lst" lst
285 printDebug "[csvPost] p" p
286 _ <- setListNgrams l NgramsTerms p
287 printDebug "ReIndexing List" l
288 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
289 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
290 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
294 ------------------------------------------------------------------------
295 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
297 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
299 printDebug "[csvPostAsync] filetype" ft
300 printDebug "[csvPostAsync] name" n
302 csvPostAsync' lId f log''
305 csvPostAsync' :: FlowCmdM env err m
310 csvPostAsync' l (WithTextFile _ m _) logStatus = do
311 logStatus JobLog { _scst_succeeded = Just 0
312 , _scst_failed = Just 0
313 , _scst_remaining = Just 1
314 , _scst_events = Just []
318 pure JobLog { _scst_succeeded = Just 1
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 0
321 , _scst_events = Just []
323 ------------------------------------------------------------------------