2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.Orchestrator.Types
27 import Gargantext.API.Ngrams (setListNgrams)
28 import Gargantext.API.Ngrams.List.Types
29 import Gargantext.API.Ngrams.Prelude (getNgramsList)
30 import Gargantext.API.Ngrams.Tools (getTermsWith)
31 import Gargantext.API.Ngrams.Types
32 import Gargantext.API.Prelude (GargServer)
33 import Gargantext.API.Types
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Admin.Types.Hyperdata.Document
41 import Gargantext.Database.Admin.Types.Node
42 import Gargantext.Database.Query.Table.Node (getNode)
43 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
44 import Gargantext.Database.Schema.Context
45 import Gargantext.Database.Schema.Ngrams
46 import Gargantext.Database.Schema.Node (_node_parent_id)
47 import Gargantext.Database.Types (Indexed(..))
48 import Gargantext.Prelude
50 import Servant.Job.Async
51 import qualified Data.ByteString.Lazy as BSL
52 import qualified Data.Csv as Csv
53 import qualified Data.HashMap.Strict as HashMap
54 import qualified Data.List as List
55 import qualified Data.Map as Map
56 import qualified Data.Set as Set
57 import qualified Data.Text as Text
58 import qualified Data.Vector as Vec
59 import qualified Prelude
60 import qualified Protolude as P
61 ------------------------------------------------------------------------
62 type GETAPI = Summary "Get List"
64 :> Capture "listId" ListId
65 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
66 getApi :: GargServer GETAPI
69 ----------------------
70 type JSONAPI = Summary "Update List"
72 :> Capture "listId" ListId
76 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
78 jsonApi :: GargServer JSONAPI
81 ----------------------
82 type CSVAPI = Summary "Update List (legacy v3 CSV)"
84 :> Capture "listId" ListId
89 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
91 csvApi :: GargServer CSVAPI
94 ------------------------------------------------------------------------
95 get :: HasNodeStory env err m =>
96 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
98 lst <- getNgramsList lId
99 let (NodeId id') = lId
100 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
106 ------------------------------------------------------------------------
109 setList :: FlowCmdM env err m
114 -- TODO check with Version for optim
115 printDebug "New list as file" l
116 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
120 ------------------------------------------------------------------------
121 -- | Re-index documents of a corpus with new ngrams (called orphans here)
122 reIndexWith :: ( HasNodeStory env err m
130 reIndexWith cId lId nt lts = do
131 -- Getting [NgramsTerm]
133 <$> map (\(k,vs) -> k:vs)
135 <$> getTermsWith identity [lId] nt lts
137 -- printDebug "ts" ts
139 -- Taking the ngrams with 0 occurrences only (orphans)
140 -- occs <- getOccByNgramsOnlyFast' cId lId nt ts
142 -- printDebug "occs" occs
144 let orphans = ts {- List.concat
145 $ map (\t -> case HashMap.lookup t occs of
147 Just n -> if n <= 1 then [t] else [ ]
150 -- printDebug "orphans" orphans
152 -- Get all documents of the corpus
153 docs <- selectDocNodes cId
154 -- printDebug "docs length" (List.length docs)
156 -- Checking Text documents where orphans match
159 ngramsByDoc = map (HashMap.fromList)
160 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
161 $ map (\doc -> List.zip
162 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
163 $ Text.unlines $ catMaybes
164 [ doc ^. context_hyperdata . hd_title
165 , doc ^. context_hyperdata . hd_abstract
168 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
171 -- printDebug "ngramsByDoc" ngramsByDoc
173 -- Saving the indexation in database
174 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
176 pure () -- ngramsByDoc
178 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
179 toIndexedNgrams m t = Indexed <$> i <*> n
181 i = HashMap.lookup t m
182 n = Just (text2ngrams t)
184 ------------------------------------------------------------------------
185 type PostAPI = Summary "Update List"
189 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
191 postAsync :: GargServer JSONAPI
194 JobFunction (\f log' ->
197 -- printDebug "postAsync ListId" x
199 in postAsync' lId f log'')
201 postAsync' :: FlowCmdM env err m
206 postAsync' l (WithFile _ m _) logStatus = do
208 logStatus JobLog { _scst_succeeded = Just 0
209 , _scst_failed = Just 0
210 , _scst_remaining = Just 2
211 , _scst_events = Just []
213 printDebug "New list as file" l
215 -- printDebug "Done" r
217 logStatus JobLog { _scst_succeeded = Just 1
218 , _scst_failed = Just 0
219 , _scst_remaining = Just 1
220 , _scst_events = Just []
224 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
225 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
226 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
228 pure JobLog { _scst_succeeded = Just 2
229 , _scst_failed = Just 0
230 , _scst_remaining = Just 0
231 , _scst_events = Just []
235 ------------------------------------------------------------------------
236 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
241 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
243 readCsvText :: Text -> [(Text, Text, Text)]
244 readCsvText t = case eDec of
246 Right dec -> Vec.toList dec
248 lt = BSL.fromStrict $ P.encodeUtf8 t
249 eDec = Csv.decodeWith
250 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
251 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
253 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
254 parseCsvData lst = Map.fromList $ conv <$> lst
256 conv (status, label, forms) =
257 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
258 , _nre_list = case status == "map" of
260 False -> case status == "main" of
261 True -> CandidateTerm
263 , _nre_root = Nothing
264 , _nre_parent = Nothing
265 , _nre_children = MSet
267 $ map (\form -> (NgramsTerm form, ()))
269 $ splitOn "|&|" forms
273 csvPost :: FlowCmdM env err m
278 printDebug "[csvPost] l" l
279 -- printDebug "[csvPost] m" m
280 -- status label forms
281 let lst = readCsvText m
282 let p = parseCsvData lst
283 --printDebug "[csvPost] lst" lst
284 printDebug "[csvPost] p" p
285 _ <- setListNgrams l NgramsTerms p
286 printDebug "ReIndexing List" l
287 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
288 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
289 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
293 ------------------------------------------------------------------------
294 csvPostAsync :: GargServer CSVAPI
297 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
299 printDebug "[csvPostAsync] filetype" ft
300 printDebug "[csvPostAsync] name" n
302 csvPostAsync' lId f log''
305 csvPostAsync' :: FlowCmdM env err m
310 csvPostAsync' l (WithTextFile _ m _) logStatus = do
311 logStatus JobLog { _scst_succeeded = Just 0
312 , _scst_failed = Just 0
313 , _scst_remaining = Just 1
314 , _scst_events = Just []
318 pure JobLog { _scst_succeeded = Just 1
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 0
321 , _scst_events = Just []
323 ------------------------------------------------------------------------