2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE MonoLocalBinds #-}
13 {-# LANGUAGE TemplateHaskell #-}
14 {-# LANGUAGE TypeOperators #-}
16 module Gargantext.API.Ngrams.List
19 import Control.Lens hiding (elements, Indexed)
21 import qualified Data.ByteString.Lazy as BSL
22 import qualified Data.Csv as Csv
23 import Data.Either (Either(..))
24 import Data.HashMap.Strict (HashMap)
25 import qualified Data.HashMap.Strict as HashMap
26 import qualified Data.List as List
27 import Data.Map (Map, toList, fromList)
28 import qualified Data.Map as Map
29 import Data.Maybe (catMaybes)
31 import Data.Text (Text, concat, pack)
32 import qualified Data.Text as Text
33 import Data.Vector (Vector)
34 import qualified Data.Vector as Vec
35 import Network.HTTP.Media ((//), (/:))
36 import qualified Prelude as Prelude
38 import Servant.Job.Async
40 import qualified Protolude as P
42 import Gargantext.API.Admin.Orchestrator.Types
43 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
44 import Gargantext.API.Ngrams.List.Types
45 import Gargantext.API.Ngrams.Tools (getTermsWith)
46 import Gargantext.API.Ngrams.Types
47 import Gargantext.API.Prelude (GargServer)
48 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
49 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
50 import Gargantext.Core.Types.Main (ListType(..))
51 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
54 import Gargantext.Database.Admin.Types.Hyperdata.Document
55 import Gargantext.Database.Admin.Types.Node
56 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
57 import Gargantext.Database.Schema.Ngrams
58 import Gargantext.Database.Schema.Node
59 import Gargantext.Database.Types (Indexed(..))
60 import Gargantext.Prelude
63 ------------------------------------------------------------------------
64 get :: RepoCmdM env err m =>
65 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
68 let (NodeId id') = lId
69 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
75 get' :: RepoCmdM env err m
76 => ListId -> m NgramsList
79 <$> mapM (getNgramsTableMap lId) ngramsTypes
81 ------------------------------------------------------------------------
84 post :: FlowCmdM env err m
89 -- TODO check with Version for optim
90 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
95 -----------------------------------------------------------------------------
96 -- | Re-index documents of a corpus with new ngrams (called orphans here)
97 reIndexWith :: ( HasRepo env
105 reIndexWith cId lId nt lts = do
106 -- Getting [NgramsTerm]
108 <$> map (\(k,vs) -> k:vs)
110 <$> getTermsWith identity [lId] nt lts
112 -- printDebug "ts" ts
114 -- Taking the ngrams with 0 occurrences only (orphans)
115 occs <- getOccByNgramsOnlyFast' cId lId nt ts
117 -- printDebug "occs" occs
119 let orphans = List.concat
120 $ map (\t -> case HashMap.lookup t occs of
122 Just n -> if n <= 1 then [t] else [ ]
125 -- printDebug "orphans" orphans
127 -- Get all documents of the corpus
128 docs <- selectDocNodes cId
129 -- printDebug "docs length" (List.length docs)
131 -- Checking Text documents where orphans match
134 ngramsByDoc = map (HashMap.fromList)
135 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
136 $ map (\doc -> List.zip
137 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
138 $ Text.unlines $ catMaybes
139 [ doc ^. node_hyperdata . hd_title
140 , doc ^. node_hyperdata . hd_abstract
143 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
146 -- printDebug "ngramsByDoc" ngramsByDoc
148 -- Saving the indexation in database
149 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
151 pure () -- ngramsByDoc
153 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
154 toIndexedNgrams m t = Indexed <$> i <*> n
156 i = HashMap.lookup t m
157 n = Just (text2ngrams t)
159 ------------------------------------------------------------------------
160 type PostAPI = Summary "Update List"
164 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
166 postAsync :: ListId -> GargServer PostAPI
169 JobFunction (\f log' -> postAsync' lId f (liftBase . log'))
171 postAsync' :: FlowCmdM env err m
176 postAsync' l (WithFile _ m _) logStatus = do
178 logStatus JobLog { _scst_succeeded = Just 0
179 , _scst_failed = Just 0
180 , _scst_remaining = Just 1
181 , _scst_events = Just []
185 pure JobLog { _scst_succeeded = Just 1
186 , _scst_failed = Just 0
187 , _scst_remaining = Just 0
188 , _scst_events = Just []
190 ------------------------------------------------------------------------
191 readCsvText :: Text -> [(Text, Text, Text)]
192 readCsvText t = case eDec of
194 Right dec -> Vec.toList dec
196 lt = BSL.fromStrict $ P.encodeUtf8 t
197 eDec = Csv.decodeWith
198 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
199 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
201 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
202 parseCsvData lst = Map.fromList $ conv <$> lst
204 conv (_status, label, _forms) =
205 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
206 , _nre_list = CandidateTerm
207 , _nre_root = Nothing
208 , _nre_parent = Nothing
209 , _nre_children = MSet Map.empty })
211 csvPost :: FlowCmdM env err m
216 printDebug "[csvPost] l" l
217 -- printDebug "[csvPost] m" m
218 -- status label forms
219 let lst = readCsvText m
220 let p = parseCsvData lst
221 --printDebug "[csvPost] lst" lst
222 --printDebug "[csvPost] p" p
223 _ <- setListNgrams l NgramsTerms p
225 ------------------------------------------------------------------------
226 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
231 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
233 csvPostAsync :: ListId -> GargServer CSVPostAPI
236 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
238 printDebug "[csvPostAsync] filetype" ft
239 printDebug "[csvPostAsync] name" n
241 csvPostAsync' lId f log''
243 csvPostAsync' :: FlowCmdM env err m
248 csvPostAsync' l (WithTextFile _ m _) logStatus = do
249 logStatus JobLog { _scst_succeeded = Just 0
250 , _scst_failed = Just 0
251 , _scst_remaining = Just 1
252 , _scst_events = Just []
256 pure JobLog { _scst_succeeded = Just 1
257 , _scst_failed = Just 0
258 , _scst_remaining = Just 0
259 , _scst_events = Just []
262 ------------------------------------------------------------------------
263 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
264 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
268 api :: ListId -> GargServer API
269 api l = get l :<|> postAsync l :<|> csvPostAsync l
272 instance Accept HTML where
273 contentType _ = "text" // "html" /: ("charset", "utf-8")
274 instance ToJSON a => MimeRender HTML a where
275 mimeRender _ = encode