{-|
Module      : Gargantext.API.Ngrams.List
Description : Get Ngrams (lists)
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators   #-}

module Gargantext.API.Ngrams.List
  where

import Control.Lens hiding (elements, Indexed)
import Data.Either (Either(..))
import Data.HashMap.Strict (HashMap)
import Data.Map.Strict (Map, toList)
import Data.Maybe (catMaybes, fromMaybe)
import Data.Set (Set)
import Data.Text (Text, concat, pack, splitOn)
import Data.Vector (Vector)
import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (setListNgrams)
import Gargantext.API.Ngrams.List.Types
import Gargantext.API.Ngrams.Prelude (getNgramsList)
import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types
import Gargantext.API.Prelude (GargServer, GargM, GargError)
import Gargantext.API.Types
import Gargantext.Core.NodeStory
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.Action.Flow (saveDocNgramsWith)
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getNode)
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Schema.Context
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Node (_node_parent_id)
import Gargantext.Database.Types (Indexed(..))
import Gargantext.Prelude
import Gargantext.Utils.Jobs (serveJobsAPI, MonadJobStatus(..))
import Servant
-- import Servant.Job.Async
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Csv as Csv
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.Vector as Vec
import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
import qualified Gargantext.Utils.Servant as GUS
import qualified Prelude
import qualified Protolude as P
------------------------------------------------------------------------
type GETAPI = Summary "Get List"
              :> "lists"
              :> Capture "listId" ListId
              :> "json"
              :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
         :<|> "lists"
              :> Capture "listId" ListId
              :> "csv"
              :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
getApi :: GargServer GETAPI
getApi = getJson :<|> getCsv

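-- The two GET routes above, as concrete requests (a sketch; paths are relative
-- to wherever GETAPI is mounted in the top-level server):
--
--   GET .../lists/<listId>/json   -- whole NgramsList, served as a JSON attachment
--   GET .../lists/<listId>/csv    -- NgramsTerms table only, served as a CSV attachment
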
----------------------
type JSONAPI = Summary "Update List"
               :> "lists"
               :> Capture "listId" ListId
               :> "add"
               :> "form"
               :> "async"
               :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog

jsonApi :: ServerT JSONAPI (GargM Env GargError)
jsonApi = jsonPostAsync

----------------------
type CSVAPI = Summary "Update List (legacy v3 CSV)"
              :> "lists"
              :> Capture "listId" ListId
              :> "csv"
              :> "add"
              :> "form"
              :> "async"
              :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog

csvApi :: ServerT CSVAPI (GargM Env GargError)
csvApi = csvPostAsync

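-- The two update routes above, as concrete requests (a sketch; the exact form
-- field names come from the FromForm instances of WithJsonFile and WithTextFile,
-- see the Gargantext.API.Ngrams.List.Types import):
--
--   POST .../lists/<listId>/add/form/async       -- JSON list upload (WithJsonFile)
--   POST .../lists/<listId>/csv/add/form/async   -- legacy v3 CSV/TSV upload (WithTextFile)
--
-- Both enqueue an asynchronous job and report progress through JobLog.
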
------------------------------------------------------------------------
getJson :: HasNodeStory env err m =>
           ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
getJson lId = do
  lst <- getNgramsList lId
  let (NodeId id') = lId
  return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
                             , pack $ show id'
                             , ".json"
                             ]
                     ) lst

getCsv :: HasNodeStory env err m =>
          ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
getCsv lId = do
  lst <- getNgramsList lId
  let (NodeId id') = lId
  return $ case Map.lookup TableNgrams.NgramsTerms lst of
    Nothing -> noHeader Map.empty
    Just (Versioned { _v_data }) ->
      addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
                        , pack $ show id'
                        , ".csv"
                        ]
                ) _v_data

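-- For a list whose NodeId is 123, the header built above is
--   Content-Disposition: attachment; filename=GarganText_NgramsList-123.json
-- (resp. ".csv"), so clients save the response as a file rather than rendering it.
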
------------------------------------------------------------------------
-- TODO : purge list
-- TODO talk
setList :: FlowCmdM env err m
        => ListId
        -> NgramsList
        -> m Bool
setList l m = do
  -- TODO check with Version for optim
  -- printDebug "New list as file" l
  _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
  -- TODO reindex
  pure True

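-- setList walks the uploaded NgramsList table by table. As a sketch, an upload
-- containing only terms,
--   Map.fromList [(TableNgrams.NgramsTerms, Versioned 0 table)]
-- results in the single call
--   setListNgrams l TableNgrams.NgramsTerms table
-- the version number is currently ignored (hence the TODO above).
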
------------------------------------------------------------------------
-- | Re-index documents of a corpus with new ngrams (called orphans here)
reIndexWith :: ( HasNodeStory env err m
               , FlowCmdM env err m
               )
            => CorpusId
            -> ListId
            -> NgramsType
            -> Set ListType
            -> m ()
reIndexWith cId lId nt lts = do
  -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)

  -- Getting [NgramsTerm]
  ts <- List.concat
        <$> map (\(k, vs) -> k:vs)
        <$> HashMap.toList
        <$> getTermsWith identity [lId] nt lts

  -- Get all documents of the corpus
  docs <- selectDocNodes cId

  let
    -- fromListWith (<>)
    -- Per document (reading the pipeline bottom-up):
    --   * find, in its title and abstract, occurrences of the list terms
    --     (termsInText yields each matched term together with its count),
    --   * pair every match with a base map  nt -> (docId -> 1),
    --   * re-key by the extracted ngrams, turning the values into (1, count),
    --   * and merge duplicate ngrams within the document, summing the counts.
    ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
                $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
                $ map (\doc -> List.zip
                               (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
                                 $ Text.unlines $ catMaybes
                                   [ doc ^. context_hyperdata . hd_title
                                   , doc ^. context_hyperdata . hd_abstract
                                   ]
                               )
                               (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
                      ) docs

  -- printDebug "ngramsByDoc: " ngramsByDoc

  -- Saving the indexation in database
  _ <- mapM (saveDocNgramsWith lId) ngramsByDoc

  pure ()

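-- Illustrative shape of one ngramsByDoc entry (made-up values): if document
-- docId mentions the list term "brown algae" twice in its title or abstract,
-- its per-document map contains roughly
--   SimpleNgrams (text2ngrams "brown algae") |-> Map.singleton nt (Map.singleton docId (1, 2))
-- which saveDocNgramsWith then persists for the list lId.
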
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
  where
    i = HashMap.lookup t m
    n = Just (text2ngrams t)

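-- Usage sketch: with m = HashMap.fromList [("brown algae", 7)],
--   toIndexedNgrams m "brown algae"  ~> Just (Indexed 7 (text2ngrams "brown algae"))
--   toIndexedNgrams m "missing term" ~> Nothing
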
------------------------------------------------------------------------
jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
jsonPostAsync lId =
  serveJobsAPI UpdateNgramsListJobJSON $ \jHandle f ->
    postAsync' lId f jHandle

postAsync' :: (FlowCmdM env err m, MonadJobStatus m)
           => ListId
           -> WithJsonFile
           -> JobHandle m
           -> m ()
postAsync' l (WithJsonFile m _) jobHandle = do

  markStarted 2 jobHandle
  -- printDebug "New list as file" l
  _ <- setList l m
  -- printDebug "Done" r

  markProgress 1 jobHandle

  corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
  let corpus_id = fromMaybe (panic "[G.A.N.List.postAsync'] list node has no parent corpus") (_node_parent_id corpus_node)
  _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])

  markComplete jobHandle

------------------------------------------------------------------------

readCsvText :: Text -> Either Text [(Text, Text, Text)]
readCsvText t = case eDec of
    Left err  -> Left $ pack err
    Right dec -> Right $ Vec.toList dec
  where
    lt = BSL.fromStrict $ P.encodeUtf8 t
    eDec = Csv.decodeWith
             (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
             Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))

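-- readCsvText expects tab-separated rows of (status, label, forms); Csv.HasHeader
-- makes the decoder skip the first row. A sketch with made-up data:
--   readCsvText "status\tlabel\tforms\nmap\tbrown algae\tbrown alga|&|algae\n"
--     ~> Right [("map", "brown algae", "brown alga|&|algae")]
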
parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
parseCsvData lst = Map.fromList $ conv <$> lst
  where
    conv (status, label, forms) =
      (NgramsTerm label, NgramsRepoElement { _nre_size     = 1
                                           , _nre_list     = case status of
                                                               "map"  -> MapTerm
                                                               "main" -> CandidateTerm
                                                               _      -> StopTerm
                                           , _nre_root     = Nothing
                                           , _nre_parent   = Nothing
                                           , _nre_children = MSet
                                                           $ Map.fromList
                                                           $ map (\form -> (NgramsTerm form, ()))
                                                           $ filter (\w -> w /= "" && w /= label)
                                                           $ splitOn "|&|" forms
                                           })

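-- Continuing the sketch above, the row ("map", "brown algae", "brown alga|&|algae")
-- becomes the entry
--   NgramsTerm "brown algae" |-> NgramsRepoElement with _nre_list = MapTerm
--   and children {NgramsTerm "brown alga", NgramsTerm "algae"};
-- a status of "main" maps to CandidateTerm, anything else to StopTerm.
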
csvPost :: FlowCmdM env err m
        => ListId
        -> Text
        -> m (Either Text ())
csvPost l m = do
  -- printDebug "[csvPost] l" l
  -- printDebug "[csvPost] m" m
  -- status label forms
  let eLst = readCsvText m
  case eLst of
    Left err  -> pure $ Left err
    Right lst -> do
      let p = parseCsvData lst
      -- printDebug "[csvPost] lst" lst
      -- printDebug "[csvPost] p" p
      _ <- setListNgrams l NgramsTerms p
      -- printDebug "ReIndexing List" l
      corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
      let corpus_id = fromMaybe (panic "[G.A.N.List.csvPost] list node has no parent corpus") (_node_parent_id corpus_node)
      _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])

      pure $ Right ()

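-- In short: csvPost decodes the uploaded TSV, overwrites the NgramsTerms table
-- of the list, then re-indexes the parent corpus for Map and Candidate terms;
-- decoding errors are returned as Left and reported by the caller.
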
------------------------------------------------------------------------
csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
csvPostAsync lId =
  serveJobsAPI UpdateNgramsListJobCSV $ \jHandle f -> do
    markStarted 1 jHandle
    ePost <- csvPost lId (_wtf_data f)
    case ePost of
      Left err -> markFailed (Just err) jHandle
      Right () -> markComplete jHandle

    getLatestJobStatus jHandle >>= printDebug "[csvPostAsync] job ended with joblog: "

------------------------------------------------------------------------

-- | This is for debugging the CSV parser in the REPL
importCsvFile :: FlowCmdM env err m
              => ListId -> P.FilePath -> m (Either Text ())
importCsvFile lId fp = do
  contents <- liftBase $ P.readFile fp
  csvPost lId contents
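
-- REPL usage sketch, assuming some runner for FlowCmdM actions is in scope
-- (the runner name and the NodeId are made up for illustration):
--   > runner $ importCsvFile (NodeId 42) "/tmp/terms.tsv"
-- where /tmp/terms.tsv holds tab-separated (status, label, forms) rows preceded
-- by a header line.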