2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList)
23 import Data.Maybe (catMaybes)
25 import Data.Text (Text, concat, pack, splitOn)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer)
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
44 import Gargantext.Database.Schema.Context
45 import Gargantext.Database.Schema.Ngrams
46 import Gargantext.Database.Types (Indexed(..))
47 import Gargantext.Prelude
48 import Network.HTTP.Media ((//), (/:))
50 import Servant.Job.Async
51 import qualified Data.ByteString.Lazy as BSL
52 import qualified Data.Csv as Csv
53 import qualified Data.HashMap.Strict as HashMap
54 import qualified Data.List as List
55 import qualified Data.Map as Map
56 import qualified Data.Text as Text
57 import qualified Data.Vector as Vec
58 import qualified Prelude as Prelude
59 import qualified Protolude as P
60 ------------------------------------------------------------------------
61 type GETAPI = Summary "Get List"
63 :> Capture "listId" ListId
64 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
65 getApi :: GargServer GETAPI
69 instance Accept HTML where
70 contentType _ = "text" // "html" /: ("charset", "utf-8")
71 instance ToJSON a => MimeRender HTML a where
74 ----------------------
75 type JSONAPI = Summary "Update List"
77 :> Capture "listId" ListId
81 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
83 jsonApi :: GargServer JSONAPI
86 ----------------------
87 type CSVAPI = Summary "Update List (legacy v3 CSV)"
89 :> Capture "listId" ListId
94 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
96 csvApi :: GargServer CSVAPI
99 ------------------------------------------------------------------------
100 get :: HasNodeStory env err m =>
101 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
103 lst <- getNgramsList lId
104 let (NodeId id') = lId
105 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
111 ------------------------------------------------------------------------
114 setList :: FlowCmdM env err m
119 -- TODO check with Version for optim
120 printDebug "New list as file" l
121 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
125 ------------------------------------------------------------------------
126 -- | Re-index documents of a corpus with new ngrams (called orphans here)
127 reIndexWith :: ( HasNodeStory env err m
135 reIndexWith cId lId nt lts = do
136 -- Getting [NgramsTerm]
138 <$> map (\(k,vs) -> k:vs)
140 <$> getTermsWith identity [lId] nt lts
144 -- Taking the ngrams with 0 occurrences only (orphans)
145 occs <- getOccByNgramsOnlyFast' cId lId nt ts
147 printDebug "occs" occs
149 let orphans = List.concat
150 $ map (\t -> case HashMap.lookup t occs of
152 Just n -> if n <= 1 then [t] else [ ]
155 printDebug "orphans" orphans
157 -- Get all documents of the corpus
158 docs <- selectDocNodes cId
159 printDebug "docs length" (List.length docs)
161 -- Checking Text documents where orphans match
164 ngramsByDoc = map (HashMap.fromList)
165 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
166 $ map (\doc -> List.zip
167 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
168 $ Text.unlines $ catMaybes
169 [ doc ^. context_hyperdata . hd_title
170 , doc ^. context_hyperdata . hd_abstract
173 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
176 printDebug "ngramsByDoc" ngramsByDoc
178 -- Saving the indexation in database
179 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
181 pure () -- ngramsByDoc
183 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
184 toIndexedNgrams m t = Indexed <$> i <*> n
186 i = HashMap.lookup t m
187 n = Just (text2ngrams t)
189 ------------------------------------------------------------------------
190 type PostAPI = Summary "Update List"
194 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
196 postAsync :: GargServer JSONAPI
199 JobFunction (\f log' ->
202 printDebug "postAsync ListId" x
204 in postAsync' lId f log'')
206 postAsync' :: FlowCmdM env err m
211 postAsync' l (WithFile _ m _) logStatus = do
213 logStatus JobLog { _scst_succeeded = Just 0
214 , _scst_failed = Just 0
215 , _scst_remaining = Just 1
216 , _scst_events = Just []
218 printDebug "New list as file" l
220 -- printDebug "Done" r
222 pure JobLog { _scst_succeeded = Just 1
223 , _scst_failed = Just 0
224 , _scst_remaining = Just 0
225 , _scst_events = Just []
227 ------------------------------------------------------------------------
228 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
233 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
235 readCsvText :: Text -> [(Text, Text, Text)]
236 readCsvText t = case eDec of
238 Right dec -> Vec.toList dec
240 lt = BSL.fromStrict $ P.encodeUtf8 t
241 eDec = Csv.decodeWith
242 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
243 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
245 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
246 parseCsvData lst = Map.fromList $ conv <$> lst
248 conv (status, label, forms) =
249 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
250 , _nre_list = case status == "map" of
252 False -> case status == "main" of
253 True -> CandidateTerm
255 , _nre_root = Nothing
256 , _nre_parent = Nothing
257 , _nre_children = MSet
259 $ map (\form -> (NgramsTerm form, ()))
261 $ splitOn "|&|" forms
265 csvPost :: FlowCmdM env err m
270 printDebug "[csvPost] l" l
271 -- printDebug "[csvPost] m" m
272 -- status label forms
273 let lst = readCsvText m
274 let p = parseCsvData lst
275 --printDebug "[csvPost] lst" lst
276 printDebug "[csvPost] p" p
277 _ <- setListNgrams l NgramsTerms p
280 ------------------------------------------------------------------------
281 csvPostAsync :: GargServer CSVAPI
284 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
286 printDebug "[csvPostAsync] filetype" ft
287 printDebug "[csvPostAsync] name" n
289 csvPostAsync' lId f log''
292 csvPostAsync' :: FlowCmdM env err m
297 csvPostAsync' l (WithTextFile _ m _) logStatus = do
298 logStatus JobLog { _scst_succeeded = Just 0
299 , _scst_failed = Just 0
300 , _scst_remaining = Just 1
301 , _scst_events = Just []
305 pure JobLog { _scst_succeeded = Just 1
306 , _scst_failed = Just 0
307 , _scst_remaining = Just 0
308 , _scst_events = Just []
310 ------------------------------------------------------------------------