2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (toList, fromList)
22 import Data.Maybe (catMaybes)
24 import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
25 import Data.Text (Text, concat, pack)
26 import GHC.Generics (Generic)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types
31 import Gargantext.API.Node.Corpus.New.File (FileType(..))
32 import Gargantext.API.Prelude (GargServer)
33 import Gargantext.Core.NodeStory
34 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
35 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
36 import Gargantext.Core.Types.Main (ListType(..))
37 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
44 import Gargantext.Database.Schema.Ngrams
45 import Gargantext.Database.Schema.Node
46 import Gargantext.Database.Types (Indexed(..))
47 import Gargantext.Prelude
48 import Network.HTTP.Media ((//), (/:))
50 import Servant.Job.Async
51 import Servant.Job.Utils (jsonOptions)
52 import Web.FormUrlEncoded (FromForm)
53 import qualified Data.HashMap.Strict as HashMap
54 import qualified Data.List as List
55 import qualified Data.Map as Map
56 import qualified Data.Text as Text
58 ------------------------------------------------------------------------
60 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
61 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
66 instance Accept HTML where
67 contentType _ = "text" // "html" /: ("charset", "utf-8")
68 instance ToJSON a => MimeRender HTML a where
72 api :: ListId -> GargServer API
73 api l = get l :<|> postAsync l :<|> csvPostAsync l
75 ----------------------
76 type GETAPI = Summary "Get List"
78 :> Capture "listId" ListId
79 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
80 getApi :: GargServer GETAPI
83 ----------------------
84 type JSONAPI = Summary "Update List"
86 :> Capture "listId" ListId
90 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
92 jsonApi :: GargServer JSONAPI
95 ----------------------
96 type CSVAPI = Summary "Update List (legacy v3 CSV)"
98 :> Capture "listId" ListId
103 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
105 csvApi :: GargServer CSVAPI
106 csvApi = csvPostAsync
108 ----------------------
112 ------------------------------------------------------------------------
113 get :: HasNodeStory env err m =>
114 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
117 let (NodeId id') = lId
118 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
124 get' :: HasNodeStory env err m
125 => ListId -> m NgramsList
128 <$> mapM (getNgramsTableMap lId) ngramsTypes
130 ------------------------------------------------------------------------
133 post :: FlowCmdM env err m
138 -- TODO check with Version for optim
139 printDebug "New list as file" l
140 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
144 ------------------------------------------------------------------------
145 csvPost :: FlowCmdM env err m
150 printDebug "[csvPost] l" l
151 printDebug "[csvPost] m" m
154 -----------------------------------------------------------------------------
155 -- | Re-index documents of a corpus with new ngrams (called orphans here)
156 reIndexWith :: ( HasNodeStory env err m
164 reIndexWith cId lId nt lts = do
165 -- Getting [NgramsTerm]
167 <$> map (\(k,vs) -> k:vs)
169 <$> getTermsWith identity [lId] nt lts
171 -- printDebug "ts" ts
173 -- Taking the ngrams with 0 occurrences only (orphans)
174 occs <- getOccByNgramsOnlyFast' cId lId nt ts
176 -- printDebug "occs" occs
178 let orphans = List.concat
179 $ map (\t -> case HashMap.lookup t occs of
181 Just n -> if n <= 1 then [t] else [ ]
184 -- printDebug "orphans" orphans
186 -- Get all documents of the corpus
187 docs <- selectDocNodes cId
188 -- printDebug "docs length" (List.length docs)
190 -- Checking Text documents where orphans match
193 ngramsByDoc = map (HashMap.fromList)
194 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
195 $ map (\doc -> List.zip
196 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
197 $ Text.unlines $ catMaybes
198 [ doc ^. node_hyperdata . hd_title
199 , doc ^. node_hyperdata . hd_abstract
202 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
205 -- printDebug "ngramsByDoc" ngramsByDoc
207 -- Saving the indexation in database
208 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
210 pure () -- ngramsByDoc
212 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
213 toIndexedNgrams m t = Indexed <$> i <*> n
215 i = HashMap.lookup t m
216 n = Just (text2ngrams t)
218 ------------------------------------------------------------------------
219 type PostAPI = Summary "Update List"
223 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
225 postAsync :: GargServer JSONAPI
228 JobFunction (\f log' ->
231 printDebug "postAsync ListId" x
233 in postAsync' lId f log'')
235 postAsync' :: FlowCmdM env err m
240 postAsync' l (WithFile _ m _) logStatus = do
242 logStatus JobLog { _scst_succeeded = Just 0
243 , _scst_failed = Just 0
244 , _scst_remaining = Just 1
245 , _scst_events = Just []
247 printDebug "New list as file" l
249 -- printDebug "Done" r
251 pure JobLog { _scst_succeeded = Just 1
252 , _scst_failed = Just 0
253 , _scst_remaining = Just 0
254 , _scst_events = Just []
256 ------------------------------------------------------------------------
257 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
262 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
264 csvPostAsync :: GargServer CSVAPI
267 JobFunction $ \f@(WithFile ft _ n) log' -> do
268 printDebug "[csvPostAsync] filetype" ft
269 printDebug "[csvPostAsync] name" n
270 csvPostAsync' lId f (liftBase . log')
272 csvPostAsync' :: FlowCmdM env err m
277 csvPostAsync' l (WithFile _ m _) logStatus = do
278 logStatus JobLog { _scst_succeeded = Just 0
279 , _scst_failed = Just 0
280 , _scst_remaining = Just 1
281 , _scst_events = Just []
285 pure JobLog { _scst_succeeded = Just 1
286 , _scst_failed = Just 0
287 , _scst_remaining = Just 0
288 , _scst_events = Just []
290 ------------------------------------------------------------------------
292 data WithFile = WithFile
293 { _wf_filetype :: !FileType
294 , _wf_data :: !NgramsList
296 } deriving (Eq, Show, Generic)
298 makeLenses ''WithFile
299 instance FromForm WithFile
300 instance FromJSON WithFile where
301 parseJSON = genericParseJSON $ jsonOptions "_wf_"
302 instance ToJSON WithFile where
303 toJSON = genericToJSON $ jsonOptions "_wf_"
304 instance ToSchema WithFile where
305 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")