2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList, fromList)
23 import Data.Maybe (catMaybes)
25 import Data.Text (Text, concat, pack)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types
31 import Gargantext.API.Ngrams.List.Types
32 import Gargantext.API.Prelude (GargServer)
33 import Gargantext.Core.NodeStory
34 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
35 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
36 import Gargantext.Core.Types.Main (ListType(..))
37 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
38 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
39 import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
40 import Gargantext.Database.Admin.Types.Hyperdata.Document
41 import Gargantext.Database.Admin.Types.Node
42 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
43 import Gargantext.Database.Schema.Ngrams
44 import Gargantext.Database.Schema.Node
45 import Gargantext.Database.Types (Indexed(..))
46 import Gargantext.Prelude
47 import Network.HTTP.Media ((//), (/:))
49 import Servant.Job.Async
50 import qualified Data.ByteString.Lazy as BSL
51 import qualified Data.Csv as Csv
52 import qualified Data.HashMap.Strict as HashMap
53 import qualified Data.List as List
54 import qualified Data.Map as Map
55 import qualified Data.Text as Text
56 import qualified Data.Vector as Vec
57 import qualified Prelude as Prelude
58 import qualified Protolude as P
59 ------------------------------------------------------------------------
62 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
63 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
66 api :: ListId -> GargServer API
67 api l = get l :<|> postAsync l :<|> csvPostAsync l
70 ----------------------
71 type GETAPI = Summary "Get List"
73 :> Capture "listId" ListId
74 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
75 getApi :: GargServer GETAPI
79 instance Accept HTML where
80 contentType _ = "text" // "html" /: ("charset", "utf-8")
81 instance ToJSON a => MimeRender HTML a where
84 ----------------------
85 type JSONAPI = Summary "Update List"
87 :> Capture "listId" ListId
91 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
93 jsonApi :: GargServer JSONAPI
96 ----------------------
97 type CSVAPI = Summary "Update List (legacy v3 CSV)"
99 :> Capture "listId" ListId
104 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
106 csvApi :: GargServer CSVAPI
107 csvApi = csvPostAsync
109 ------------------------------------------------------------------------
110 get :: HasNodeStory env err m =>
111 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
114 let (NodeId id') = lId
115 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
121 get' :: HasNodeStory env err m
122 => ListId -> m NgramsList
125 <$> mapM (getNgramsTableMap lId) ngramsTypes
127 ------------------------------------------------------------------------
130 post :: FlowCmdM env err m
135 -- TODO check with Version for optim
136 printDebug "New list as file" l
137 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
141 ------------------------------------------------------------------------
142 -- | Re-index documents of a corpus with new ngrams (called orphans here)
143 reIndexWith :: ( HasNodeStory env err m
151 reIndexWith cId lId nt lts = do
152 -- Getting [NgramsTerm]
154 <$> map (\(k,vs) -> k:vs)
156 <$> getTermsWith identity [lId] nt lts
158 -- printDebug "ts" ts
160 -- Taking the ngrams with 0 occurrences only (orphans)
161 occs <- getOccByNgramsOnlyFast' cId lId nt ts
163 -- printDebug "occs" occs
165 let orphans = List.concat
166 $ map (\t -> case HashMap.lookup t occs of
168 Just n -> if n <= 1 then [t] else [ ]
171 -- printDebug "orphans" orphans
173 -- Get all documents of the corpus
174 docs <- selectDocNodes cId
175 -- printDebug "docs length" (List.length docs)
177 -- Checking Text documents where orphans match
180 ngramsByDoc = map (HashMap.fromList)
181 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
182 $ map (\doc -> List.zip
183 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
184 $ Text.unlines $ catMaybes
185 [ doc ^. node_hyperdata . hd_title
186 , doc ^. node_hyperdata . hd_abstract
189 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
190 ) (map context2node docs)
192 -- printDebug "ngramsByDoc" ngramsByDoc
194 -- Saving the indexation in database
195 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
197 pure () -- ngramsByDoc
199 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
200 toIndexedNgrams m t = Indexed <$> i <*> n
202 i = HashMap.lookup t m
203 n = Just (text2ngrams t)
205 ------------------------------------------------------------------------
206 type PostAPI = Summary "Update List"
210 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
212 postAsync :: GargServer JSONAPI
215 JobFunction (\f log' ->
218 printDebug "postAsync ListId" x
220 in postAsync' lId f log'')
222 postAsync' :: FlowCmdM env err m
227 postAsync' l (WithFile _ m _) logStatus = do
229 logStatus JobLog { _scst_succeeded = Just 0
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 1
232 , _scst_events = Just []
234 printDebug "New list as file" l
236 -- printDebug "Done" r
238 pure JobLog { _scst_succeeded = Just 1
239 , _scst_failed = Just 0
240 , _scst_remaining = Just 0
241 , _scst_events = Just []
243 ------------------------------------------------------------------------
245 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
250 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
252 readCsvText :: Text -> [(Text, Text, Text)]
253 readCsvText t = case eDec of
255 Right dec -> Vec.toList dec
257 lt = BSL.fromStrict $ P.encodeUtf8 t
258 eDec = Csv.decodeWith
259 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
260 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
262 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
263 parseCsvData lst = Map.fromList $ conv <$> lst
265 conv (_status, label, _forms) =
266 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
267 , _nre_list = CandidateTerm
268 , _nre_root = Nothing
269 , _nre_parent = Nothing
270 , _nre_children = MSet Map.empty })
272 csvPost :: FlowCmdM env err m
277 printDebug "[csvPost] l" l
278 -- printDebug "[csvPost] m" m
279 -- status label forms
280 let lst = readCsvText m
281 let p = parseCsvData lst
282 --printDebug "[csvPost] lst" lst
283 printDebug "[csvPost] p" p
284 _ <- setListNgrams l NgramsTerms p
286 ------------------------------------------------------------------------
290 csvPostAsync :: GargServer CSVAPI
293 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
295 printDebug "[csvPostAsync] filetype" ft
296 printDebug "[csvPostAsync] name" n
298 csvPostAsync' lId f log''
301 csvPostAsync' :: FlowCmdM env err m
306 csvPostAsync' l (WithTextFile _ m _) logStatus = do
307 logStatus JobLog { _scst_succeeded = Just 0
308 , _scst_failed = Just 0
309 , _scst_remaining = Just 1
310 , _scst_events = Just []
314 pure JobLog { _scst_succeeded = Just 1
315 , _scst_failed = Just 0
316 , _scst_remaining = Just 0
317 , _scst_events = Just []
319 ------------------------------------------------------------------------