2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList)
23 import Data.Maybe (catMaybes, fromMaybe)
25 import Data.Text (Text, concat, pack, splitOn)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer)
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Network.HTTP.Media ((//), (/:))
52 import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Prelude
62 import qualified Protolude as P
63 ------------------------------------------------------------------------
64 type GETAPI = Summary "Get List"
66 :> Capture "listId" ListId
67 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
68 getApi :: GargServer GETAPI
72 instance Accept HTML where
73 contentType _ = "text" // "html" /: ("charset", "utf-8")
74 instance ToJSON a => MimeRender HTML a where
77 ----------------------
78 type JSONAPI = Summary "Update List"
80 :> Capture "listId" ListId
84 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
86 jsonApi :: GargServer JSONAPI
89 ----------------------
90 type CSVAPI = Summary "Update List (legacy v3 CSV)"
92 :> Capture "listId" ListId
97 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
99 csvApi :: GargServer CSVAPI
100 csvApi = csvPostAsync
102 ------------------------------------------------------------------------
103 get :: HasNodeStory env err m =>
104 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
106 lst <- getNgramsList lId
107 let (NodeId id') = lId
108 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
114 ------------------------------------------------------------------------
117 setList :: FlowCmdM env err m
122 -- TODO check with Version for optim
123 printDebug "New list as file" l
124 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
128 ------------------------------------------------------------------------
129 -- | Re-index documents of a corpus with new ngrams (called orphans here)
130 reIndexWith :: ( HasNodeStory env err m
138 reIndexWith cId lId nt lts = do
139 -- Getting [NgramsTerm]
141 <$> map (\(k,vs) -> k:vs)
143 <$> getTermsWith identity [lId] nt lts
145 -- printDebug "ts" ts
147 -- Taking the ngrams with 0 occurrences only (orphans)
148 occs <- getOccByNgramsOnlyFast' cId lId nt ts
150 -- printDebug "occs" occs
152 let orphans = List.concat
153 $ map (\t -> case HashMap.lookup t occs of
155 Just n -> if n <= 1 then [t] else [ ]
158 -- printDebug "orphans" orphans
160 -- Get all documents of the corpus
161 docs <- selectDocNodes cId
162 -- printDebug "docs length" (List.length docs)
164 -- Checking Text documents where orphans match
167 ngramsByDoc = map (HashMap.fromList)
168 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
169 $ map (\doc -> List.zip
170 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
171 $ Text.unlines $ catMaybes
172 [ doc ^. context_hyperdata . hd_title
173 , doc ^. context_hyperdata . hd_abstract
176 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
179 -- printDebug "ngramsByDoc" ngramsByDoc
181 -- Saving the indexation in database
182 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
184 pure () -- ngramsByDoc
186 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
187 toIndexedNgrams m t = Indexed <$> i <*> n
189 i = HashMap.lookup t m
190 n = Just (text2ngrams t)
192 ------------------------------------------------------------------------
193 type PostAPI = Summary "Update List"
197 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
199 postAsync :: GargServer JSONAPI
202 JobFunction (\f log' ->
205 -- printDebug "postAsync ListId" x
207 in postAsync' lId f log'')
209 postAsync' :: FlowCmdM env err m
214 postAsync' l (WithFile _ m _) logStatus = do
216 logStatus JobLog { _scst_succeeded = Just 0
217 , _scst_failed = Just 0
218 , _scst_remaining = Just 2
219 , _scst_events = Just []
221 printDebug "New list as file" l
223 -- printDebug "Done" r
225 logStatus JobLog { _scst_succeeded = Just 1
226 , _scst_failed = Just 0
227 , _scst_remaining = Just 1
228 , _scst_events = Just []
232 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
233 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
234 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
236 pure JobLog { _scst_succeeded = Just 2
237 , _scst_failed = Just 0
238 , _scst_remaining = Just 0
239 , _scst_events = Just []
243 ------------------------------------------------------------------------
244 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
249 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
251 readCsvText :: Text -> [(Text, Text, Text)]
252 readCsvText t = case eDec of
254 Right dec -> Vec.toList dec
256 lt = BSL.fromStrict $ P.encodeUtf8 t
257 eDec = Csv.decodeWith
258 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
259 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
261 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
262 parseCsvData lst = Map.fromList $ conv <$> lst
264 conv (status, label, forms) =
265 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
266 , _nre_list = case status == "map" of
268 False -> case status == "main" of
269 True -> CandidateTerm
271 , _nre_root = Nothing
272 , _nre_parent = Nothing
273 , _nre_children = MSet
275 $ map (\form -> (NgramsTerm form, ()))
277 $ splitOn "|&|" forms
281 csvPost :: FlowCmdM env err m
286 printDebug "[csvPost] l" l
287 -- printDebug "[csvPost] m" m
288 -- status label forms
289 let lst = readCsvText m
290 let p = parseCsvData lst
291 --printDebug "[csvPost] lst" lst
292 printDebug "[csvPost] p" p
293 _ <- setListNgrams l NgramsTerms p
294 printDebug "ReIndexing List" l
295 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
296 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
297 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
301 ------------------------------------------------------------------------
302 csvPostAsync :: GargServer CSVAPI
305 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
307 printDebug "[csvPostAsync] filetype" ft
308 printDebug "[csvPostAsync] name" n
310 csvPostAsync' lId f log''
313 csvPostAsync' :: FlowCmdM env err m
318 csvPostAsync' l (WithTextFile _ m _) logStatus = do
319 logStatus JobLog { _scst_succeeded = Just 0
320 , _scst_failed = Just 0
321 , _scst_remaining = Just 1
322 , _scst_events = Just []
326 pure JobLog { _scst_succeeded = Just 1
327 , _scst_failed = Just 0
328 , _scst_remaining = Just 0
329 , _scst_events = Just []
331 ------------------------------------------------------------------------