2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Prelude
62 import qualified Protolude as P
63 ------------------------------------------------------------------------
64 type GETAPI = Summary "Get List"
66 :> Capture "listId" ListId
67 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
68 getApi :: GargServer GETAPI
71 ----------------------
72 type JSONAPI = Summary "Update List"
74 :> Capture "listId" ListId
78 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
80 jsonApi :: ServerT JSONAPI (GargM Env GargError)
83 ----------------------
84 type CSVAPI = Summary "Update List (legacy v3 CSV)"
86 :> Capture "listId" ListId
91 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
93 csvApi :: ServerT CSVAPI (GargM Env GargError)
96 ------------------------------------------------------------------------
97 get :: HasNodeStory env err m =>
98 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
100 lst <- getNgramsList lId
101 let (NodeId id') = lId
102 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
108 ------------------------------------------------------------------------
111 setList :: FlowCmdM env err m
116 -- TODO check with Version for optim
117 printDebug "New list as file" l
118 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
122 ------------------------------------------------------------------------
123 -- | Re-index documents of a corpus with new ngrams (called orphans here)
124 reIndexWith :: ( HasNodeStory env err m
132 reIndexWith cId lId nt lts = do
133 -- Getting [NgramsTerm]
135 <$> map (\(k,vs) -> k:vs)
137 <$> getTermsWith identity [lId] nt lts
139 -- printDebug "ts" ts
141 -- Taking the ngrams with 0 occurrences only (orphans)
142 -- occs <- getOccByNgramsOnlyFast' cId lId nt ts
144 -- printDebug "occs" occs
146 let orphans = ts {- List.concat
147 $ map (\t -> case HashMap.lookup t occs of
149 Just n -> if n <= 1 then [t] else [ ]
152 -- printDebug "orphans" orphans
154 -- Get all documents of the corpus
155 docs <- selectDocNodes cId
156 -- printDebug "docs length" (List.length docs)
158 -- Checking Text documents where orphans match
162 ngramsByDoc = map (HashMap.fromList)
163 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
164 $ map (\doc -> List.zip
165 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
166 $ Text.unlines $ catMaybes
167 [ doc ^. context_hyperdata . hd_title
168 , doc ^. context_hyperdata . hd_abstract
171 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
174 -- printDebug "ngramsByDoc" ngramsByDoc
176 -- Saving the indexation in database
177 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
179 pure () -- ngramsByDoc
181 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
182 toIndexedNgrams m t = Indexed <$> i <*> n
184 i = HashMap.lookup t m
185 n = Just (text2ngrams t)
187 ------------------------------------------------------------------------
188 type PostAPI = Summary "Update List"
192 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
194 postAsync :: ListId -> ServerT PostAPI (GargM Env GargError)
196 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
199 -- printDebug "postAsync ListId" x
201 in postAsync' lId f log''
203 postAsync' :: FlowCmdM env err m
208 postAsync' l (WithFile _ m _) logStatus = do
210 logStatus JobLog { _scst_succeeded = Just 0
211 , _scst_failed = Just 0
212 , _scst_remaining = Just 2
213 , _scst_events = Just []
215 printDebug "New list as file" l
217 -- printDebug "Done" r
219 logStatus JobLog { _scst_succeeded = Just 1
220 , _scst_failed = Just 0
221 , _scst_remaining = Just 1
222 , _scst_events = Just []
226 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
227 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
228 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
230 pure JobLog { _scst_succeeded = Just 2
231 , _scst_failed = Just 0
232 , _scst_remaining = Just 0
233 , _scst_events = Just []
237 ------------------------------------------------------------------------
238 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
243 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
245 readCsvText :: Text -> [(Text, Text, Text)]
246 readCsvText t = case eDec of
248 Right dec -> Vec.toList dec
250 lt = BSL.fromStrict $ P.encodeUtf8 t
251 eDec = Csv.decodeWith
252 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
253 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
255 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
256 parseCsvData lst = Map.fromList $ conv <$> lst
258 conv (status, label, forms) =
259 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
260 , _nre_list = case status == "map" of
262 False -> case status == "main" of
263 True -> CandidateTerm
265 , _nre_root = Nothing
266 , _nre_parent = Nothing
267 , _nre_children = MSet
269 $ map (\form -> (NgramsTerm form, ()))
270 $ filter (\w -> w /= "" && w /= label)
271 $ splitOn "|&|" forms
275 csvPost :: FlowCmdM env err m
280 printDebug "[csvPost] l" l
281 -- printDebug "[csvPost] m" m
282 -- status label forms
283 let lst = readCsvText m
284 let p = parseCsvData lst
285 --printDebug "[csvPost] lst" lst
286 printDebug "[csvPost] p" p
287 _ <- setListNgrams l NgramsTerms p
288 printDebug "ReIndexing List" l
289 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
290 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
291 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
295 ------------------------------------------------------------------------
296 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
298 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
300 printDebug "[csvPostAsync] filetype" ft
301 printDebug "[csvPostAsync] name" n
303 csvPostAsync' lId f log''
306 csvPostAsync' :: FlowCmdM env err m
311 csvPostAsync' l (WithTextFile _ m _) logStatus = do
312 logStatus JobLog { _scst_succeeded = Just 0
313 , _scst_failed = Just 0
314 , _scst_remaining = Just 1
315 , _scst_events = Just []
319 pure JobLog { _scst_succeeded = Just 1
320 , _scst_failed = Just 0
321 , _scst_remaining = Just 0
322 , _scst_events = Just []
324 ------------------------------------------------------------------------