2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (toList, fromList)
22 import Data.Maybe (catMaybes)
24 import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
25 import Data.Text (Text, concat, pack)
26 import GHC.Generics (Generic)
27 import Network.HTTP.Media ((//), (/:))
29 import Servant.Job.Async
30 import Servant.Job.Utils (jsonOptions)
31 import Web.FormUrlEncoded (FromForm)
32 import qualified Data.HashMap.Strict as HashMap
33 import qualified Data.List as List
34 import qualified Data.Map as Map
35 import qualified Data.Text as Text
37 import Gargantext.API.Admin.Orchestrator.Types
38 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
39 import Gargantext.API.Ngrams.Tools (getTermsWith)
40 import Gargantext.API.Ngrams.Types
41 import Gargantext.API.Node.Corpus.New.File (FileType(..))
42 import Gargantext.API.Prelude (GargServer)
43 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
44 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
45 import Gargantext.Core.Types.Main (ListType(..))
46 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
47 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
48 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
49 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
50 import Gargantext.Database.Admin.Types.Hyperdata.Document
51 import Gargantext.Database.Admin.Types.Node
52 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
53 import Gargantext.Database.Schema.Ngrams
54 import Gargantext.Database.Schema.Node
55 import Gargantext.Database.Types (Indexed(..))
56 import Gargantext.Prelude
58 ------------------------------------------------------------------------
59 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
60 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
64 api :: ListId -> GargServer API
65 api l = get l :<|> postAsync l :<|> csvPostAsync l
68 instance Accept HTML where
69 contentType _ = "text" // "html" /: ("charset", "utf-8")
70 instance ToJSON a => MimeRender HTML a where
73 ------------------------------------------------------------------------
74 get :: RepoCmdM env err m =>
75 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
78 let (NodeId id') = lId
79 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
85 get' :: RepoCmdM env err m
86 => ListId -> m NgramsList
89 <$> mapM (getNgramsTableMap lId) ngramsTypes
91 ------------------------------------------------------------------------
94 post :: FlowCmdM env err m
99 -- TODO check with Version for optim
100 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
104 ------------------------------------------------------------------------
105 csvPost :: FlowCmdM env err m
110 printDebug "[csvPost] l" l
111 printDebug "[csvPost] m" m
114 -----------------------------------------------------------------------------
115 -- | Re-index documents of a corpus with new ngrams (called orphans here)
116 reIndexWith :: ( HasRepo env
124 reIndexWith cId lId nt lts = do
125 -- Getting [NgramsTerm]
127 <$> map (\(k,vs) -> k:vs)
129 <$> getTermsWith identity [lId] nt lts
131 -- printDebug "ts" ts
133 -- Taking the ngrams with 0 occurrences only (orphans)
134 occs <- getOccByNgramsOnlyFast' cId lId nt ts
136 -- printDebug "occs" occs
138 let orphans = List.concat
139 $ map (\t -> case HashMap.lookup t occs of
141 Just n -> if n <= 1 then [t] else [ ]
144 -- printDebug "orphans" orphans
146 -- Get all documents of the corpus
147 docs <- selectDocNodes cId
148 -- printDebug "docs length" (List.length docs)
150 -- Checking Text documents where orphans match
153 ngramsByDoc = map (HashMap.fromList)
154 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
155 $ map (\doc -> List.zip
156 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
157 $ Text.unlines $ catMaybes
158 [ doc ^. node_hyperdata . hd_title
159 , doc ^. node_hyperdata . hd_abstract
162 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
165 -- printDebug "ngramsByDoc" ngramsByDoc
167 -- Saving the indexation in database
168 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
170 pure () -- ngramsByDoc
172 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
173 toIndexedNgrams m t = Indexed <$> i <*> n
175 i = HashMap.lookup t m
176 n = Just (text2ngrams t)
178 ------------------------------------------------------------------------
179 type PostAPI = Summary "Update List"
183 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
185 postAsync :: ListId -> GargServer PostAPI
188 JobFunction (\f log' -> postAsync' lId f (liftBase . log'))
190 postAsync' :: FlowCmdM env err m
195 postAsync' l (WithFile _ m _) logStatus = do
197 logStatus JobLog { _scst_succeeded = Just 0
198 , _scst_failed = Just 0
199 , _scst_remaining = Just 1
200 , _scst_events = Just []
204 pure JobLog { _scst_succeeded = Just 1
205 , _scst_failed = Just 0
206 , _scst_remaining = Just 0
207 , _scst_events = Just []
209 ------------------------------------------------------------------------
210 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
215 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
217 csvPostAsync :: ListId -> GargServer PostAPI
220 JobFunction $ \f@(WithFile ft _ n) log' -> do
221 printDebug "[csvPostAsync] filetype" ft
222 printDebug "[csvPostAsync] name" n
223 csvPostAsync' lId f (liftBase . log')
225 csvPostAsync' :: FlowCmdM env err m
230 csvPostAsync' l (WithFile _ m _) logStatus = do
231 logStatus JobLog { _scst_succeeded = Just 0
232 , _scst_failed = Just 0
233 , _scst_remaining = Just 1
234 , _scst_events = Just []
238 pure JobLog { _scst_succeeded = Just 1
239 , _scst_failed = Just 0
240 , _scst_remaining = Just 0
241 , _scst_events = Just []
243 ------------------------------------------------------------------------
245 data WithFile = WithFile
246 { _wf_filetype :: !FileType
247 , _wf_data :: !NgramsList
249 } deriving (Eq, Show, Generic)
251 makeLenses ''WithFile
252 instance FromForm WithFile
253 instance FromJSON WithFile where
254 parseJSON = genericParseJSON $ jsonOptions "_wf_"
255 instance ToJSON WithFile where
256 toJSON = genericToJSON $ jsonOptions "_wf_"
257 instance ToSchema WithFile where
258 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")