2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map.Strict (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 -- import Gargantext.Database.Action.Metrics.NgramsByContext (refreshNgramsMaterialized)
42 import Gargantext.Database.Admin.Types.Hyperdata.Document
43 import Gargantext.Database.Admin.Types.Node
44 import Gargantext.Database.Query.Table.Node (getNode)
45 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
46 import Gargantext.Database.Schema.Context
47 import Gargantext.Database.Schema.Ngrams
48 import Gargantext.Database.Schema.Node (_node_parent_id)
49 import Gargantext.Database.Types (Indexed(..))
50 import Gargantext.Prelude
51 import Gargantext.Utils.Jobs (serveJobsAPI, MonadJobStatus(..))
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map.Strict as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
62 import qualified Gargantext.Utils.Servant as GUS
63 import qualified Prelude
64 import qualified Protolude as P
65 ------------------------------------------------------------------------
66 type GETAPI = Summary "Get List"
68 :> Capture "listId" ListId
70 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
72 :> Capture "listId" ListId
74 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
75 getApi :: GargServer GETAPI
76 getApi = getJson :<|> getCsv
78 ----------------------
79 type JSONAPI = Summary "Update List"
81 :> Capture "listId" ListId
85 :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog
87 jsonApi :: ServerT JSONAPI (GargM Env GargError)
88 jsonApi = jsonPostAsync
90 ----------------------
91 type CSVAPI = Summary "Update List (legacy v3 CSV)"
93 :> Capture "listId" ListId
98 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
100 csvApi :: ServerT CSVAPI (GargM Env GargError)
101 csvApi = csvPostAsync
103 ------------------------------------------------------------------------
104 getJson :: HasNodeStory env err m =>
105 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
107 lst <- getNgramsList lId
108 let (NodeId id') = lId
109 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
115 getCsv :: HasNodeStory env err m =>
116 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
118 lst <- getNgramsList lId
119 let (NodeId id') = lId
120 return $ case Map.lookup TableNgrams.NgramsTerms lst of
121 Nothing -> noHeader Map.empty
122 Just (Versioned { _v_data }) ->
123 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
129 ------------------------------------------------------------------------
132 setList :: FlowCmdM env err m
137 -- TODO check with Version for optim
138 -- printDebug "New list as file" l
139 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
143 ------------------------------------------------------------------------
144 -- | Re-index documents of a corpus with new ngrams (called orphans here)
145 reIndexWith :: ( HasNodeStory env err m
153 reIndexWith cId lId nt lts = do
154 -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
156 -- Getting [NgramsTerm]
158 <$> map (\(k,vs) -> k:vs)
160 <$> getTermsWith identity [lId] nt lts
162 -- Get all documents of the corpus
163 docs <- selectDocNodes cId
167 ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
168 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
169 $ map (\doc -> List.zip
170 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
171 $ Text.unlines $ catMaybes
172 [ doc ^. context_hyperdata . hd_title
173 , doc ^. context_hyperdata . hd_abstract
176 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
179 -- printDebug "ngramsByDoc: " ngramsByDoc
181 -- Saving the indexation in database
182 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
183 -- _ <- refreshNgramsMaterialized
186 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
187 toIndexedNgrams m t = Indexed <$> i <*> n
189 i = HashMap.lookup t m
190 n = Just (text2ngrams t)
192 ------------------------------------------------------------------------
193 jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
195 serveJobsAPI UpdateNgramsListJobJSON $ \jHandle f ->
196 postAsync' lId f jHandle
198 postAsync' :: (FlowCmdM env err m, MonadJobStatus m)
203 postAsync' l (WithJsonFile m _) jobHandle = do
205 markStarted 2 jobHandle
206 -- printDebug "New list as file" l
208 -- printDebug "Done" r
210 markProgress 1 jobHandle
212 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
213 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
214 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
216 markComplete jobHandle
218 ------------------------------------------------------------------------
220 readCsvText :: Text -> Either Text [(Text, Text, Text)]
221 readCsvText t = case eDec of
222 Left err -> Left $ pack err
223 Right dec -> Right $ Vec.toList dec
225 lt = BSL.fromStrict $ P.encodeUtf8 t
226 eDec = Csv.decodeWith
227 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
228 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
230 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
231 parseCsvData lst = Map.fromList $ conv <$> lst
233 conv (status, label, forms) =
234 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
235 , _nre_list = case status == "map" of
237 False -> case status == "main" of
238 True -> CandidateTerm
240 , _nre_root = Nothing
241 , _nre_parent = Nothing
242 , _nre_children = MSet
244 $ map (\form -> (NgramsTerm form, ()))
245 $ filter (\w -> w /= "" && w /= label)
246 $ splitOn "|&|" forms
250 csvPost :: FlowCmdM env err m
253 -> m (Either Text ())
255 -- printDebug "[csvPost] l" l
256 -- printDebug "[csvPost] m" m
257 -- status label forms
258 let eLst = readCsvText m
260 Left err -> pure $ Left err
262 let p = parseCsvData lst
263 --printDebug "[csvPost] lst" lst
264 -- printDebug "[csvPost] p" p
265 _ <- setListNgrams l NgramsTerms p
266 -- printDebug "ReIndexing List" l
267 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
268 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
269 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
273 ------------------------------------------------------------------------
274 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
276 serveJobsAPI UpdateNgramsListJobCSV $ \jHandle f -> do
277 markStarted 1 jHandle
278 ePost <- csvPost lId (_wtf_data f)
280 Left err -> markFailed (Just err) jHandle
281 Right () -> markComplete jHandle
283 getLatestJobStatus jHandle >>= printDebug "[csvPostAsync] job ended with joblog: "
285 ------------------------------------------------------------------------
288 -- | This is for debugging the CSV parser in the REPL
289 importCsvFile :: FlowCmdM env err m
290 => ListId -> P.FilePath -> m (Either Text ())
291 importCsvFile lId fp = do
292 contents <- liftBase $ P.readFile fp