2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE MonoLocalBinds #-}
13 {-# LANGUAGE TemplateHaskell #-}
14 {-# LANGUAGE TypeOperators #-}
16 module Gargantext.API.Ngrams.List
19 import Control.Lens hiding (elements, Indexed)
21 import qualified Data.ByteString.Lazy as BSL
22 import qualified Data.Csv as Csv
23 import Data.Either (Either(..))
24 import Data.HashMap.Strict (HashMap)
25 import qualified Data.HashMap.Strict as HashMap
26 import qualified Data.List as List
27 import Data.Map (Map, toList, fromList)
28 import qualified Data.Map as Map
29 import Data.Maybe (catMaybes)
31 import Data.Text (Text, concat, pack)
32 import qualified Data.Text as Text
33 import Data.Vector (Vector)
34 import qualified Data.Vector as Vec
35 import Network.HTTP.Media ((//), (/:))
36 import qualified Prelude as Prelude
38 import Servant.Job.Async
40 import qualified Protolude as P
42 import Gargantext.API.Admin.Orchestrator.Types
43 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
44 import Gargantext.API.Ngrams.List.Types
45 import Gargantext.API.Ngrams.Tools (getTermsWith)
46 import Gargantext.API.Ngrams.Types
47 import Gargantext.API.Prelude (GargServer)
48 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
49 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
50 import Gargantext.Core.Types.Main (ListType(..))
51 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
54 import Gargantext.Database.Admin.Types.Hyperdata.Document
55 import Gargantext.Database.Admin.Types.Node
56 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
57 import Gargantext.Database.Schema.Ngrams
58 import Gargantext.Database.Schema.Node
59 import Gargantext.Database.Types (Indexed(..))
60 import Gargantext.Prelude
62 ------------------------------------------------------------------------
65 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
66 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
70 api :: ListId -> GargServer API
71 api l = get l :<|> postAsync l :<|> csvPostAsync l
75 ----------------------
76 type GETAPI = Summary "Get List"
78 :> Capture "listId" ListId
79 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
80 getApi :: GargServer GETAPI
84 instance Accept HTML where
85 contentType _ = "text" // "html" /: ("charset", "utf-8")
86 instance ToJSON a => MimeRender HTML a where
90 ----------------------
91 type JSONAPI = Summary "Update List"
93 :> Capture "listId" ListId
97 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
99 jsonApi :: GargServer JSONAPI
102 ----------------------
103 type CSVAPI = Summary "Update List (legacy v3 CSV)"
105 :> Capture "listId" ListId
110 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
112 csvApi :: GargServer CSVAPI
113 csvApi = csvPostAsync
115 ----------------------
119 ------------------------------------------------------------------------
120 get :: RepoCmdM env err m =>
121 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
124 let (NodeId id') = lId
125 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
131 get' :: RepoCmdM env err m
132 => ListId -> m NgramsList
135 <$> mapM (getNgramsTableMap lId) ngramsTypes
137 ------------------------------------------------------------------------
140 post :: FlowCmdM env err m
145 -- TODO check with Version for optim
146 printDebug "New list as file" l
147 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
152 -----------------------------------------------------------------------------
153 -- | Re-index documents of a corpus with new ngrams (called orphans here)
154 reIndexWith :: ( HasRepo env
162 reIndexWith cId lId nt lts = do
163 -- Getting [NgramsTerm]
165 <$> map (\(k,vs) -> k:vs)
167 <$> getTermsWith identity [lId] nt lts
169 -- printDebug "ts" ts
171 -- Taking the ngrams with 0 occurrences only (orphans)
172 occs <- getOccByNgramsOnlyFast' cId lId nt ts
174 -- printDebug "occs" occs
176 let orphans = List.concat
177 $ map (\t -> case HashMap.lookup t occs of
179 Just n -> if n <= 1 then [t] else [ ]
182 -- printDebug "orphans" orphans
184 -- Get all documents of the corpus
185 docs <- selectDocNodes cId
186 -- printDebug "docs length" (List.length docs)
188 -- Checking Text documents where orphans match
191 ngramsByDoc = map (HashMap.fromList)
192 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
193 $ map (\doc -> List.zip
194 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
195 $ Text.unlines $ catMaybes
196 [ doc ^. node_hyperdata . hd_title
197 , doc ^. node_hyperdata . hd_abstract
200 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
203 -- printDebug "ngramsByDoc" ngramsByDoc
205 -- Saving the indexation in database
206 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
208 pure () -- ngramsByDoc
210 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
211 toIndexedNgrams m t = Indexed <$> i <*> n
213 i = HashMap.lookup t m
214 n = Just (text2ngrams t)
216 ------------------------------------------------------------------------
217 type PostAPI = Summary "Update List"
221 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
223 postAsync :: GargServer JSONAPI
226 JobFunction (\f log' ->
229 printDebug "postAsync ListId" x
231 in postAsync' lId f log'')
233 postAsync' :: FlowCmdM env err m
238 postAsync' l (WithFile _ m _) logStatus = do
240 logStatus JobLog { _scst_succeeded = Just 0
241 , _scst_failed = Just 0
242 , _scst_remaining = Just 1
243 , _scst_events = Just []
245 printDebug "New list as file" l
247 -- printDebug "Done" r
249 pure JobLog { _scst_succeeded = Just 1
250 , _scst_failed = Just 0
251 , _scst_remaining = Just 0
252 , _scst_events = Just []
254 ------------------------------------------------------------------------
255 readCsvText :: Text -> [(Text, Text, Text)]
256 readCsvText t = case eDec of
258 Right dec -> Vec.toList dec
260 lt = BSL.fromStrict $ P.encodeUtf8 t
261 eDec = Csv.decodeWith
262 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
263 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
265 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
266 parseCsvData lst = Map.fromList $ conv <$> lst
268 conv (_status, label, _forms) =
269 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
270 , _nre_list = CandidateTerm
271 , _nre_root = Nothing
272 , _nre_parent = Nothing
273 , _nre_children = MSet Map.empty })
275 csvPost :: FlowCmdM env err m
280 printDebug "[csvPost] l" l
281 -- printDebug "[csvPost] m" m
282 -- status label forms
283 let lst = readCsvText m
284 let p = parseCsvData lst
285 --printDebug "[csvPost] lst" lst
286 printDebug "[csvPost] p" p
287 _ <- setListNgrams l NgramsTerms p
289 ------------------------------------------------------------------------
293 csvPostAsync :: GargServer CSVAPI
296 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
298 printDebug "[csvPostAsync] filetype" ft
299 printDebug "[csvPostAsync] name" n
301 csvPostAsync' lId f log''
303 csvPostAsync' :: FlowCmdM env err m
308 csvPostAsync' l (WithTextFile _ m _) logStatus = do
309 logStatus JobLog { _scst_succeeded = Just 0
310 , _scst_failed = Just 0
311 , _scst_remaining = Just 1
312 , _scst_events = Just []
316 pure JobLog { _scst_succeeded = Just 1
317 , _scst_failed = Just 0
318 , _scst_remaining = Just 0
319 , _scst_events = Just []
322 ------------------------------------------------------------------------