]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
Merge remote-tracking branch 'origin/145-dev-graph-explorer-search-tfidf' into dev
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
23 import Data.Set (Set)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
51 import Servant
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Prelude
62 import qualified Protolude as P
63 ------------------------------------------------------------------------
64 type GETAPI = Summary "Get List"
65 :> "lists"
66 :> Capture "listId" ListId
67 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
68 getApi :: GargServer GETAPI
69 getApi = get
70
71 ----------------------
72 type JSONAPI = Summary "Update List"
73 :> "lists"
74 :> Capture "listId" ListId
75 :> "add"
76 :> "form"
77 :> "async"
78 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
79
80 jsonApi :: ServerT JSONAPI (GargM Env GargError)
81 jsonApi = postAsync
82
83 ----------------------
84 type CSVAPI = Summary "Update List (legacy v3 CSV)"
85 :> "lists"
86 :> Capture "listId" ListId
87 :> "csv"
88 :> "add"
89 :> "form"
90 :> "async"
91 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
92
93 csvApi :: ServerT CSVAPI (GargM Env GargError)
94 csvApi = csvPostAsync
95
96 ------------------------------------------------------------------------
97 get :: HasNodeStory env err m =>
98 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
99 get lId = do
100 lst <- getNgramsList lId
101 let (NodeId id') = lId
102 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
103 , pack $ show id'
104 , ".json"
105 ]
106 ) lst
107
108 ------------------------------------------------------------------------
109 -- TODO : purge list
110 -- TODO talk
111 setList :: FlowCmdM env err m
112 => ListId
113 -> NgramsList
114 -> m Bool
115 setList l m = do
116 -- TODO check with Version for optim
117 printDebug "New list as file" l
118 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
119 -- TODO reindex
120 pure True
121
122 ------------------------------------------------------------------------
123 -- | Re-index documents of a corpus with new ngrams (called orphans here)
124 reIndexWith :: ( HasNodeStory env err m
125 , FlowCmdM env err m
126 )
127 => CorpusId
128 -> ListId
129 -> NgramsType
130 -> Set ListType
131 -> m ()
132 reIndexWith cId lId nt lts = do
133 -- Getting [NgramsTerm]
134 ts <- List.concat
135 <$> map (\(k,vs) -> k:vs)
136 <$> HashMap.toList
137 <$> getTermsWith identity [lId] nt lts
138
139 -- printDebug "ts" ts
140
141 -- Taking the ngrams with 0 occurrences only (orphans)
142 -- occs <- getOccByNgramsOnlyFast' cId lId nt ts
143
144 -- printDebug "occs" occs
145
146 let orphans = ts {- List.concat
147 $ map (\t -> case HashMap.lookup t occs of
148 Nothing -> [t]
149 Just n -> if n <= 1 then [t] else [ ]
150 ) ts
151 -}
152 -- printDebug "orphans" orphans
153
154 -- Get all documents of the corpus
155 docs <- selectDocNodes cId
156 -- printDebug "docs length" (List.length docs)
157
158 -- Checking Text documents where orphans match
159 -- TODO Tests here
160 let
161 -- fromListWith (<>)
162 ngramsByDoc = map (HashMap.fromList)
163 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
164 $ map (\doc -> List.zip
165 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
166 $ Text.unlines $ catMaybes
167 [ doc ^. context_hyperdata . hd_title
168 , doc ^. context_hyperdata . hd_abstract
169 ]
170 )
171 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
172 ) docs
173
174 -- printDebug "ngramsByDoc" ngramsByDoc
175
176 -- Saving the indexation in database
177 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
178
179 pure () -- ngramsByDoc
180
181 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
182 toIndexedNgrams m t = Indexed <$> i <*> n
183 where
184 i = HashMap.lookup t m
185 n = Just (text2ngrams t)
186
187 ------------------------------------------------------------------------
188 type PostAPI = Summary "Update List"
189 :> "add"
190 :> "form"
191 :> "async"
192 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
193
194 postAsync :: ListId -> ServerT PostAPI (GargM Env GargError)
195 postAsync lId =
196 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
197 let
198 log'' x = do
199 -- printDebug "postAsync ListId" x
200 liftBase $ log' x
201 in postAsync' lId f log''
202
203 postAsync' :: FlowCmdM env err m
204 => ListId
205 -> WithFile
206 -> (JobLog -> m ())
207 -> m JobLog
208 postAsync' l (WithFile _ m _) logStatus = do
209
210 logStatus JobLog { _scst_succeeded = Just 0
211 , _scst_failed = Just 0
212 , _scst_remaining = Just 2
213 , _scst_events = Just []
214 }
215 printDebug "New list as file" l
216 _ <- setList l m
217 -- printDebug "Done" r
218
219 logStatus JobLog { _scst_succeeded = Just 1
220 , _scst_failed = Just 0
221 , _scst_remaining = Just 1
222 , _scst_events = Just []
223 }
224
225
226 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
227 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
228 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
229
230 pure JobLog { _scst_succeeded = Just 2
231 , _scst_failed = Just 0
232 , _scst_remaining = Just 0
233 , _scst_events = Just []
234 }
235
236
237 ------------------------------------------------------------------------
238 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
239 :> "csv"
240 :> "add"
241 :> "form"
242 :> "async"
243 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
244
245 readCsvText :: Text -> [(Text, Text, Text)]
246 readCsvText t = case eDec of
247 Left _ -> []
248 Right dec -> Vec.toList dec
249 where
250 lt = BSL.fromStrict $ P.encodeUtf8 t
251 eDec = Csv.decodeWith
252 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
253 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
254
255 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
256 parseCsvData lst = Map.fromList $ conv <$> lst
257 where
258 conv (status, label, forms) =
259 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
260 , _nre_list = case status == "map" of
261 True -> MapTerm
262 False -> case status == "main" of
263 True -> CandidateTerm
264 False -> StopTerm
265 , _nre_root = Nothing
266 , _nre_parent = Nothing
267 , _nre_children = MSet
268 $ Map.fromList
269 $ map (\form -> (NgramsTerm form, ()))
270 $ filter (\w -> w /= "" && w /= label)
271 $ splitOn "|&|" forms
272 }
273 )
274
275 csvPost :: FlowCmdM env err m
276 => ListId
277 -> Text
278 -> m Bool
279 csvPost l m = do
280 printDebug "[csvPost] l" l
281 -- printDebug "[csvPost] m" m
282 -- status label forms
283 let lst = readCsvText m
284 let p = parseCsvData lst
285 --printDebug "[csvPost] lst" lst
286 printDebug "[csvPost] p" p
287 _ <- setListNgrams l NgramsTerms p
288 printDebug "ReIndexing List" l
289 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
290 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
291 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
292
293 pure True
294
295 ------------------------------------------------------------------------
296 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
297 csvPostAsync lId =
298 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
299 let log'' x = do
300 printDebug "[csvPostAsync] filetype" ft
301 printDebug "[csvPostAsync] name" n
302 liftBase $ log' x
303 csvPostAsync' lId f log''
304
305
306 csvPostAsync' :: FlowCmdM env err m
307 => ListId
308 -> WithTextFile
309 -> (JobLog -> m ())
310 -> m JobLog
311 csvPostAsync' l (WithTextFile _ m _) logStatus = do
312 logStatus JobLog { _scst_succeeded = Just 0
313 , _scst_failed = Just 0
314 , _scst_remaining = Just 1
315 , _scst_events = Just []
316 }
317 _r <- csvPost l m
318
319 pure JobLog { _scst_succeeded = Just 1
320 , _scst_failed = Just 0
321 , _scst_remaining = Just 0
322 , _scst_events = Just []
323 }
324 ------------------------------------------------------------------------