2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList)
23 import Data.Maybe (catMaybes)
25 import Data.Text (Text, concat, pack)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types
31 import Gargantext.API.Ngrams.Prelude (getNgramsList)
32 import Gargantext.API.Ngrams.List.Types
33 import Gargantext.API.Prelude (GargServer)
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
44 import Gargantext.Database.Schema.Ngrams
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Types (Indexed(..))
47 import Gargantext.Prelude
48 import Network.HTTP.Media ((//), (/:))
50 import Servant.Job.Async
51 import qualified Data.ByteString.Lazy as BSL
52 import qualified Data.Csv as Csv
53 import qualified Data.HashMap.Strict as HashMap
54 import qualified Data.List as List
55 import qualified Data.Map as Map
56 import qualified Data.Text as Text
57 import qualified Data.Vector as Vec
58 import qualified Prelude as Prelude
59 import qualified Protolude as P
60 ------------------------------------------------------------------------
63 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
64 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
67 api :: ListId -> GargServer API
68 api l = get l :<|> postAsync l :<|> csvPostAsync l
71 ----------------------
72 type GETAPI = Summary "Get List"
74 :> Capture "listId" ListId
75 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
76 getApi :: GargServer GETAPI
80 instance Accept HTML where
81 contentType _ = "text" // "html" /: ("charset", "utf-8")
82 instance ToJSON a => MimeRender HTML a where
85 ----------------------
86 type JSONAPI = Summary "Update List"
88 :> Capture "listId" ListId
92 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
94 jsonApi :: GargServer JSONAPI
97 ----------------------
98 type CSVAPI = Summary "Update List (legacy v3 CSV)"
100 :> Capture "listId" ListId
105 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
107 csvApi :: GargServer CSVAPI
108 csvApi = csvPostAsync
110 ------------------------------------------------------------------------
111 get :: HasNodeStory env err m =>
112 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
114 lst <- getNgramsList lId
115 let (NodeId id') = lId
116 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
122 ------------------------------------------------------------------------
125 post :: FlowCmdM env err m
130 -- TODO check with Version for optim
131 printDebug "New list as file" l
132 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
136 ------------------------------------------------------------------------
137 -- | Re-index documents of a corpus with new ngrams (called orphans here)
138 reIndexWith :: ( HasNodeStory env err m
146 reIndexWith cId lId nt lts = do
147 -- Getting [NgramsTerm]
149 <$> map (\(k,vs) -> k:vs)
151 <$> getTermsWith identity [lId] nt lts
155 -- Taking the ngrams with 0 occurrences only (orphans)
156 occs <- getOccByNgramsOnlyFast' cId lId nt ts
158 printDebug "occs" occs
160 let orphans = List.concat
161 $ map (\t -> case HashMap.lookup t occs of
163 Just n -> if n <= 1 then [t] else [ ]
166 printDebug "orphans" orphans
168 -- Get all documents of the corpus
169 docs <- selectDocNodes cId
170 printDebug "docs length" (List.length docs)
172 -- Checking Text documents where orphans match
175 ngramsByDoc = map (HashMap.fromList)
176 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
177 $ map (\doc -> List.zip
178 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
179 $ Text.unlines $ catMaybes
180 [ doc ^. context_hyperdata . hd_title
181 , doc ^. context_hyperdata . hd_abstract
184 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
187 printDebug "ngramsByDoc" ngramsByDoc
189 -- Saving the indexation in database
190 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
192 pure () -- ngramsByDoc
194 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
195 toIndexedNgrams m t = Indexed <$> i <*> n
197 i = HashMap.lookup t m
198 n = Just (text2ngrams t)
200 ------------------------------------------------------------------------
201 type PostAPI = Summary "Update List"
205 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
207 postAsync :: GargServer JSONAPI
210 JobFunction (\f log' ->
213 printDebug "postAsync ListId" x
215 in postAsync' lId f log'')
217 postAsync' :: FlowCmdM env err m
222 postAsync' l (WithFile _ m _) logStatus = do
224 logStatus JobLog { _scst_succeeded = Just 0
225 , _scst_failed = Just 0
226 , _scst_remaining = Just 1
227 , _scst_events = Just []
229 printDebug "New list as file" l
231 -- printDebug "Done" r
233 pure JobLog { _scst_succeeded = Just 1
234 , _scst_failed = Just 0
235 , _scst_remaining = Just 0
236 , _scst_events = Just []
238 ------------------------------------------------------------------------
240 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
245 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
247 readCsvText :: Text -> [(Text, Text, Text)]
248 readCsvText t = case eDec of
250 Right dec -> Vec.toList dec
252 lt = BSL.fromStrict $ P.encodeUtf8 t
253 eDec = Csv.decodeWith
254 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
255 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
257 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
258 parseCsvData lst = Map.fromList $ conv <$> lst
260 conv (_status, label, _forms) =
261 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
262 , _nre_list = CandidateTerm
263 , _nre_root = Nothing
264 , _nre_parent = Nothing
265 , _nre_children = MSet Map.empty })
267 csvPost :: FlowCmdM env err m
272 printDebug "[csvPost] l" l
273 -- printDebug "[csvPost] m" m
274 -- status label forms
275 let lst = readCsvText m
276 let p = parseCsvData lst
277 --printDebug "[csvPost] lst" lst
278 printDebug "[csvPost] p" p
279 _ <- setListNgrams l NgramsTerms p
281 ------------------------------------------------------------------------
285 csvPostAsync :: GargServer CSVAPI
288 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
290 printDebug "[csvPostAsync] filetype" ft
291 printDebug "[csvPostAsync] name" n
293 csvPostAsync' lId f log''
296 csvPostAsync' :: FlowCmdM env err m
301 csvPostAsync' l (WithTextFile _ m _) logStatus = do
302 logStatus JobLog { _scst_succeeded = Just 0
303 , _scst_failed = Just 0
304 , _scst_remaining = Just 1
305 , _scst_events = Just []
309 pure JobLog { _scst_succeeded = Just 1
310 , _scst_failed = Just 0
311 , _scst_remaining = Just 0
312 , _scst_events = Just []
314 ------------------------------------------------------------------------