2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList)
23 import Data.Maybe (catMaybes, fromMaybe)
25 import Data.Text (Text, concat, pack, splitOn)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer)
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Admin.Types.Hyperdata.Document
41 import Gargantext.Database.Admin.Types.Node
42 import Gargantext.Database.Query.Table.Node (getNode)
43 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
44 import Gargantext.Database.Schema.Context
45 import Gargantext.Database.Schema.Ngrams
46 import Gargantext.Database.Schema.Node (_node_parent_id)
47 import Gargantext.Database.Types (Indexed(..))
48 import Gargantext.Prelude
49 import Network.HTTP.Media ((//), (/:))
51 import Servant.Job.Async
52 import qualified Data.ByteString.Lazy as BSL
53 import qualified Data.Csv as Csv
54 import qualified Data.HashMap.Strict as HashMap
55 import qualified Data.List as List
56 import qualified Data.Map as Map
57 import qualified Data.Set as Set
58 import qualified Data.Text as Text
59 import qualified Data.Vector as Vec
60 import qualified Prelude
61 import qualified Protolude as P
62 ------------------------------------------------------------------------
63 type GETAPI = Summary "Get List"
65 :> Capture "listId" ListId
66 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
67 getApi :: GargServer GETAPI
71 instance Accept HTML where
72 contentType _ = "text" // "html" /: ("charset", "utf-8")
73 instance ToJSON a => MimeRender HTML a where
76 ----------------------
77 type JSONAPI = Summary "Update List"
79 :> Capture "listId" ListId
83 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
85 jsonApi :: GargServer JSONAPI
88 ----------------------
89 type CSVAPI = Summary "Update List (legacy v3 CSV)"
91 :> Capture "listId" ListId
96 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
98 csvApi :: GargServer CSVAPI
101 ------------------------------------------------------------------------
102 get :: HasNodeStory env err m =>
103 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
105 lst <- getNgramsList lId
106 let (NodeId id') = lId
107 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
113 ------------------------------------------------------------------------
116 setList :: FlowCmdM env err m
121 -- TODO check with Version for optim
122 printDebug "New list as file" l
123 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
127 ------------------------------------------------------------------------
128 -- | Re-index documents of a corpus with new ngrams (called orphans here)
129 reIndexWith :: ( HasNodeStory env err m
137 reIndexWith cId lId nt lts = do
138 -- Getting [NgramsTerm]
140 <$> map (\(k,vs) -> k:vs)
142 <$> getTermsWith identity [lId] nt lts
144 -- printDebug "ts" ts
146 -- Taking the ngrams with 0 occurrences only (orphans)
147 -- occs <- getOccByNgramsOnlyFast' cId lId nt ts
149 -- printDebug "occs" occs
151 let orphans = ts {- List.concat
152 $ map (\t -> case HashMap.lookup t occs of
154 Just n -> if n <= 1 then [t] else [ ]
157 -- printDebug "orphans" orphans
159 -- Get all documents of the corpus
160 docs <- selectDocNodes cId
161 -- printDebug "docs length" (List.length docs)
163 -- Checking Text documents where orphans match
166 ngramsByDoc = map (HashMap.fromList)
167 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
168 $ map (\doc -> List.zip
169 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
170 $ Text.unlines $ catMaybes
171 [ doc ^. context_hyperdata . hd_title
172 , doc ^. context_hyperdata . hd_abstract
175 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
178 -- printDebug "ngramsByDoc" ngramsByDoc
180 -- Saving the indexation in database
181 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
183 pure () -- ngramsByDoc
185 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
186 toIndexedNgrams m t = Indexed <$> i <*> n
188 i = HashMap.lookup t m
189 n = Just (text2ngrams t)
191 ------------------------------------------------------------------------
192 type PostAPI = Summary "Update List"
196 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
198 postAsync :: GargServer JSONAPI
201 JobFunction (\f log' ->
204 -- printDebug "postAsync ListId" x
206 in postAsync' lId f log'')
208 postAsync' :: FlowCmdM env err m
213 postAsync' l (WithFile _ m _) logStatus = do
215 logStatus JobLog { _scst_succeeded = Just 0
216 , _scst_failed = Just 0
217 , _scst_remaining = Just 2
218 , _scst_events = Just []
220 printDebug "New list as file" l
222 -- printDebug "Done" r
224 logStatus JobLog { _scst_succeeded = Just 1
225 , _scst_failed = Just 0
226 , _scst_remaining = Just 1
227 , _scst_events = Just []
231 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
232 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
233 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
235 pure JobLog { _scst_succeeded = Just 2
236 , _scst_failed = Just 0
237 , _scst_remaining = Just 0
238 , _scst_events = Just []
242 ------------------------------------------------------------------------
243 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
248 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
250 readCsvText :: Text -> [(Text, Text, Text)]
251 readCsvText t = case eDec of
253 Right dec -> Vec.toList dec
255 lt = BSL.fromStrict $ P.encodeUtf8 t
256 eDec = Csv.decodeWith
257 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
258 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
260 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
261 parseCsvData lst = Map.fromList $ conv <$> lst
263 conv (status, label, forms) =
264 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
265 , _nre_list = case status == "map" of
267 False -> case status == "main" of
268 True -> CandidateTerm
270 , _nre_root = Nothing
271 , _nre_parent = Nothing
272 , _nre_children = MSet
274 $ map (\form -> (NgramsTerm form, ()))
276 $ splitOn "|&|" forms
280 csvPost :: FlowCmdM env err m
285 printDebug "[csvPost] l" l
286 -- printDebug "[csvPost] m" m
287 -- status label forms
288 let lst = readCsvText m
289 let p = parseCsvData lst
290 --printDebug "[csvPost] lst" lst
291 printDebug "[csvPost] p" p
292 _ <- setListNgrams l NgramsTerms p
293 printDebug "ReIndexing List" l
294 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
295 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
296 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
300 ------------------------------------------------------------------------
301 csvPostAsync :: GargServer CSVAPI
304 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
306 printDebug "[csvPostAsync] filetype" ft
307 printDebug "[csvPostAsync] name" n
309 csvPostAsync' lId f log''
312 csvPostAsync' :: FlowCmdM env err m
317 csvPostAsync' l (WithTextFile _ m _) logStatus = do
318 logStatus JobLog { _scst_succeeded = Just 0
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 1
321 , _scst_events = Just []
325 pure JobLog { _scst_succeeded = Just 1
326 , _scst_failed = Just 0
327 , _scst_remaining = Just 0
328 , _scst_events = Just []
330 ------------------------------------------------------------------------