2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map.Strict (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatterns, termsInText)
38 import Gargantext.Core.Types (TermsCount)
39 import Gargantext.Core.Types.Main (ListType(..))
40 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
41 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
42 -- import Gargantext.Database.Action.Metrics.NgramsByContext (refreshNgramsMaterialized)
43 import Gargantext.Database.Admin.Types.Hyperdata.Document
44 import Gargantext.Database.Admin.Types.Node
45 import Gargantext.Database.Query.Table.Node (getNode)
46 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
47 import Gargantext.Database.Schema.Context
48 import Gargantext.Database.Schema.Ngrams
49 import Gargantext.Database.Schema.Node (_node_parent_id)
50 import Gargantext.Database.Types (Indexed(..))
51 import Gargantext.Prelude
52 import Gargantext.Utils.Jobs (serveJobsAPI, MonadJobStatus(..))
54 import qualified Data.ByteString.Lazy as BSL
55 import qualified Data.Csv as Csv
56 import qualified Data.HashMap.Strict as HashMap
57 import qualified Data.List as List
58 import qualified Data.Map.Strict as Map
59 import qualified Data.Set as Set
60 import qualified Data.Text as Text
61 import qualified Data.Vector as Vec
62 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
63 import qualified Gargantext.Utils.Servant as GUS
64 import qualified Prelude
65 import qualified Protolude as P
66 ------------------------------------------------------------------------
67 type GETAPI = Summary "Get List"
69 :> Capture "listId" ListId
71 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
73 :> Capture "listId" ListId
75 :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
76 getApi :: GargServer GETAPI
77 getApi = getJson :<|> getCsv
79 ----------------------
80 type JSONAPI = Summary "Update List"
82 :> Capture "listId" ListId
86 :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog
88 jsonApi :: ServerT JSONAPI (GargM Env GargError)
89 jsonApi = jsonPostAsync
91 ----------------------
92 type CSVAPI = Summary "Update List (legacy v3 CSV)"
94 :> Capture "listId" ListId
99 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
101 csvApi :: ServerT CSVAPI (GargM Env GargError)
102 csvApi = csvPostAsync
104 ------------------------------------------------------------------------
105 getJson :: HasNodeStory env err m =>
106 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
108 lst <- getNgramsList lId
109 let (NodeId id') = lId
110 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
116 getCsv :: HasNodeStory env err m =>
117 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
119 lst <- getNgramsList lId
120 let (NodeId id') = lId
121 return $ case Map.lookup TableNgrams.NgramsTerms lst of
122 Nothing -> noHeader Map.empty
123 Just (Versioned { _v_data }) ->
124 addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
130 ------------------------------------------------------------------------
133 setList :: FlowCmdM env err m
138 -- TODO check with Version for optim
139 -- printDebug "New list as file" l
140 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
144 ------------------------------------------------------------------------
145 -- | Re-index documents of a corpus with new ngrams (called orphans here)
146 reIndexWith :: ( HasNodeStory env err m
154 reIndexWith cId lId nt lts = do
155 -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
157 -- Getting [NgramsTerm]
159 <$> map (\(k,vs) -> k:vs)
161 <$> getTermsWith identity [lId] nt lts
163 -- Get all documents of the corpus
164 docs <- selectDocNodes cId
168 ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
169 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
170 $ map (docNgrams nt ts) docs
172 -- printDebug "ngramsByDoc: " ngramsByDoc
174 -- Saving the indexation in database
175 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
176 -- _ <- refreshNgramsMaterialized
179 docNgrams :: NgramsType
181 -> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument
182 -> [((MatchedText, TermsCount),
183 Map NgramsType (Map NodeId Int))]
184 docNgrams nt ts doc =
186 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
187 $ Text.unlines $ catMaybes
188 [ doc ^. context_hyperdata . hd_title
189 , doc ^. context_hyperdata . hd_abstract
192 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
194 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
195 toIndexedNgrams m t = Indexed <$> i <*> n
197 i = HashMap.lookup t m
198 n = Just (text2ngrams t)
200 ------------------------------------------------------------------------
201 jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
203 serveJobsAPI UpdateNgramsListJobJSON $ \jHandle f ->
204 postAsync' lId f jHandle
206 postAsync' :: (FlowCmdM env err m, MonadJobStatus m)
211 postAsync' l (WithJsonFile m _) jobHandle = do
213 markStarted 2 jobHandle
214 -- printDebug "New list as file" l
216 -- printDebug "Done" r
218 markProgress 1 jobHandle
220 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
221 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
222 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
224 markComplete jobHandle
226 ------------------------------------------------------------------------
228 readCsvText :: Text -> Either Text [(Text, Text, Text)]
229 readCsvText t = case eDec of
230 Left err -> Left $ pack err
231 Right dec -> Right $ Vec.toList dec
233 lt = BSL.fromStrict $ P.encodeUtf8 t
234 eDec = Csv.decodeWith
235 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
236 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
238 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
239 parseCsvData lst = Map.fromList $ conv <$> lst
241 conv (status, label, forms) =
242 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
243 , _nre_list = case status == "map" of
245 False -> case status == "main" of
246 True -> CandidateTerm
248 , _nre_root = Nothing
249 , _nre_parent = Nothing
250 , _nre_children = MSet
252 $ map (\form -> (NgramsTerm form, ()))
253 $ filter (\w -> w /= "" && w /= label)
254 $ splitOn "|&|" forms
258 csvPost :: FlowCmdM env err m
261 -> m (Either Text ())
263 -- printDebug "[csvPost] l" l
264 -- printDebug "[csvPost] m" m
265 -- status label forms
266 let eLst = readCsvText m
268 Left err -> pure $ Left err
270 let p = parseCsvData lst
271 --printDebug "[csvPost] lst" lst
272 -- printDebug "[csvPost] p" p
273 _ <- setListNgrams l NgramsTerms p
274 -- printDebug "ReIndexing List" l
275 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
276 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
277 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
281 ------------------------------------------------------------------------
282 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
284 serveJobsAPI UpdateNgramsListJobCSV $ \jHandle f -> do
285 markStarted 1 jHandle
286 ePost <- csvPost lId (_wtf_data f)
288 Left err -> markFailed (Just err) jHandle
289 Right () -> markComplete jHandle
291 getLatestJobStatus jHandle >>= printDebug "[csvPostAsync] job ended with joblog: "
293 ------------------------------------------------------------------------
296 -- | This is for debugging the CSV parser in the REPL
297 importCsvFile :: FlowCmdM env err m
298 => ListId -> P.FilePath -> m (Either Text ())
299 importCsvFile lId fp = do
300 contents <- liftBase $ P.readFile fp