]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
Merge remote-tracking branch 'origin/469-dev-email-change' into dev
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Either (Either(..))
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map, toList)
22 import Data.Maybe (catMaybes, fromMaybe)
23 import Data.Set (Set)
24 import Data.Text (Text, concat, pack, splitOn)
25 import Data.Vector (Vector)
26 import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer, GargM, GargError)
34 import Gargantext.API.Types
35 import Gargantext.Core.NodeStory
36 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
38 import Gargantext.Core.Types.Main (ListType(..))
39 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Gargantext.Utils.Jobs (serveJobsAPI)
51 import Servant
52 -- import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Prelude
62 import qualified Protolude as P
63 ------------------------------------------------------------------------
64 type GETAPI = Summary "Get List"
65 :> "lists"
66 :> Capture "listId" ListId
67 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
68 getApi :: GargServer GETAPI
69 getApi = get
70
71 ----------------------
72 type JSONAPI = Summary "Update List"
73 :> "lists"
74 :> Capture "listId" ListId
75 :> "add"
76 :> "form"
77 :> "async"
78 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
79
80 jsonApi :: ServerT JSONAPI (GargM Env GargError)
81 jsonApi = postAsync
82
83 ----------------------
84 type CSVAPI = Summary "Update List (legacy v3 CSV)"
85 :> "lists"
86 :> Capture "listId" ListId
87 :> "csv"
88 :> "add"
89 :> "form"
90 :> "async"
91 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
92
93 csvApi :: ServerT CSVAPI (GargM Env GargError)
94 csvApi = csvPostAsync
95
96 ------------------------------------------------------------------------
97 get :: HasNodeStory env err m =>
98 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
99 get lId = do
100 lst <- getNgramsList lId
101 let (NodeId id') = lId
102 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
103 , pack $ show id'
104 , ".json"
105 ]
106 ) lst
107
108 ------------------------------------------------------------------------
109 -- TODO : purge list
110 -- TODO talk
111 setList :: FlowCmdM env err m
112 => ListId
113 -> NgramsList
114 -> m Bool
115 setList l m = do
116 -- TODO check with Version for optim
117 printDebug "New list as file" l
118 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
119 -- TODO reindex
120 pure True
121
122 ------------------------------------------------------------------------
123 -- | Re-index documents of a corpus with new ngrams (called orphans here)
124 reIndexWith :: ( HasNodeStory env err m
125 , FlowCmdM env err m
126 )
127 => CorpusId
128 -> ListId
129 -> NgramsType
130 -> Set ListType
131 -> m ()
132 reIndexWith cId lId nt lts = do
133 printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
134
135 -- Getting [NgramsTerm]
136 ts <- List.concat
137 <$> map (\(k,vs) -> k:vs)
138 <$> HashMap.toList
139 <$> getTermsWith identity [lId] nt lts
140
141
142 let orphans = ts {- List.concat
143 $ map (\t -> case HashMap.lookup t occs of
144 Nothing -> [t]
145 Just n -> if n <= 1 then [t] else [ ]
146 ) ts
147 -}
148
149 printDebug "orphans" orphans
150
151 -- Get all documents of the corpus
152 docs <- selectDocNodes cId
153 -- printDebug "docs length" (List.length docs)
154
155 -- Checking Text documents where orphans match
156 -- TODO Tests here
157 let
158 -- fromListWith (<>)
159 ngramsByDoc = map (HashMap.fromList)
160 $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
161 $ map (\doc -> List.zip
162 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
163 $ Text.unlines $ catMaybes
164 [ doc ^. context_hyperdata . hd_title
165 , doc ^. context_hyperdata . hd_abstract
166 ]
167 )
168 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
169 ) docs
170
171 printDebug "ngramsByDoc: " ngramsByDoc
172
173 -- Saving the indexation in database
174 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
175
176 pure ()
177
178 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
179 toIndexedNgrams m t = Indexed <$> i <*> n
180 where
181 i = HashMap.lookup t m
182 n = Just (text2ngrams t)
183
184 ------------------------------------------------------------------------
185 type PostAPI = Summary "Update List"
186 :> "add"
187 :> "form"
188 :> "async"
189 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
190
191 postAsync :: ListId -> ServerT PostAPI (GargM Env GargError)
192 postAsync lId =
193 serveJobsAPI UpdateNgramsListJobJSON $ \f log' ->
194 let
195 log'' x = do
196 -- printDebug "postAsync ListId" x
197 liftBase $ log' x
198 in postAsync' lId f log''
199
200 postAsync' :: FlowCmdM env err m
201 => ListId
202 -> WithFile
203 -> (JobLog -> m ())
204 -> m JobLog
205 postAsync' l (WithFile _ m _) logStatus = do
206
207 logStatus JobLog { _scst_succeeded = Just 0
208 , _scst_failed = Just 0
209 , _scst_remaining = Just 2
210 , _scst_events = Just []
211 }
212 printDebug "New list as file" l
213 _ <- setList l m
214 -- printDebug "Done" r
215
216 logStatus JobLog { _scst_succeeded = Just 1
217 , _scst_failed = Just 0
218 , _scst_remaining = Just 1
219 , _scst_events = Just []
220 }
221
222
223 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
224 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
225 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
226
227 pure JobLog { _scst_succeeded = Just 2
228 , _scst_failed = Just 0
229 , _scst_remaining = Just 0
230 , _scst_events = Just []
231 }
232
233
234 ------------------------------------------------------------------------
235 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
236 :> "csv"
237 :> "add"
238 :> "form"
239 :> "async"
240 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
241
242 readCsvText :: Text -> [(Text, Text, Text)]
243 readCsvText t = case eDec of
244 Left _ -> []
245 Right dec -> Vec.toList dec
246 where
247 lt = BSL.fromStrict $ P.encodeUtf8 t
248 eDec = Csv.decodeWith
249 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
250 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
251
252 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
253 parseCsvData lst = Map.fromList $ conv <$> lst
254 where
255 conv (status, label, forms) =
256 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
257 , _nre_list = case status == "map" of
258 True -> MapTerm
259 False -> case status == "main" of
260 True -> CandidateTerm
261 False -> StopTerm
262 , _nre_root = Nothing
263 , _nre_parent = Nothing
264 , _nre_children = MSet
265 $ Map.fromList
266 $ map (\form -> (NgramsTerm form, ()))
267 $ filter (\w -> w /= "" && w /= label)
268 $ splitOn "|&|" forms
269 }
270 )
271
272 csvPost :: FlowCmdM env err m
273 => ListId
274 -> Text
275 -> m Bool
276 csvPost l m = do
277 printDebug "[csvPost] l" l
278 -- printDebug "[csvPost] m" m
279 -- status label forms
280 let lst = readCsvText m
281 let p = parseCsvData lst
282 --printDebug "[csvPost] lst" lst
283 printDebug "[csvPost] p" p
284 _ <- setListNgrams l NgramsTerms p
285 printDebug "ReIndexing List" l
286 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
287 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
288 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
289
290 pure True
291
292 ------------------------------------------------------------------------
293 csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
294 csvPostAsync lId =
295 serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do
296 let log'' x = do
297 printDebug "[csvPostAsync] filetype" ft
298 printDebug "[csvPostAsync] name" n
299 liftBase $ log' x
300 csvPostAsync' lId f log''
301
302
303 csvPostAsync' :: FlowCmdM env err m
304 => ListId
305 -> WithTextFile
306 -> (JobLog -> m ())
307 -> m JobLog
308 csvPostAsync' l (WithTextFile _ m _) logStatus = do
309 logStatus JobLog { _scst_succeeded = Just 0
310 , _scst_failed = Just 0
311 , _scst_remaining = Just 1
312 , _scst_events = Just []
313 }
314 _r <- csvPost l m
315
316 pure JobLog { _scst_succeeded = Just 1
317 , _scst_failed = Just 0
318 , _scst_remaining = Just 0
319 , _scst_events = Just []
320 }
321 ------------------------------------------------------------------------