]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[FIX] Spinglass not connected components
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Aeson
20 import Data.Either (Either(..))
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map, toList)
23 import Data.Maybe (catMaybes, fromMaybe)
24 import Data.Set (Set)
25 import Data.Text (Text, concat, pack, splitOn)
26 import Data.Vector (Vector)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (setListNgrams)
29 import Gargantext.API.Ngrams.List.Types
30 import Gargantext.API.Ngrams.Prelude (getNgramsList)
31 import Gargantext.API.Ngrams.Tools (getTermsWith)
32 import Gargantext.API.Ngrams.Types
33 import Gargantext.API.Prelude (GargServer)
34 import Gargantext.Core.NodeStory
35 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
36 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
41 import Gargantext.Database.Admin.Types.Hyperdata.Document
42 import Gargantext.Database.Admin.Types.Node
43 import Gargantext.Database.Query.Table.Node (getNode)
44 import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45 import Gargantext.Database.Schema.Context
46 import Gargantext.Database.Schema.Ngrams
47 import Gargantext.Database.Schema.Node (_node_parent_id)
48 import Gargantext.Database.Types (Indexed(..))
49 import Gargantext.Prelude
50 import Network.HTTP.Media ((//), (/:))
51 import Servant
52 import Servant.Job.Async
53 import qualified Data.ByteString.Lazy as BSL
54 import qualified Data.Csv as Csv
55 import qualified Data.HashMap.Strict as HashMap
56 import qualified Data.List as List
57 import qualified Data.Map as Map
58 import qualified Data.Set as Set
59 import qualified Data.Text as Text
60 import qualified Data.Vector as Vec
61 import qualified Prelude
62 import qualified Protolude as P
63 ------------------------------------------------------------------------
64 type GETAPI = Summary "Get List"
65 :> "lists"
66 :> Capture "listId" ListId
67 :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
68 getApi :: GargServer GETAPI
69 getApi = get
70
71 data HTML
72 instance Accept HTML where
73 contentType _ = "text" // "html" /: ("charset", "utf-8")
74 instance ToJSON a => MimeRender HTML a where
75 mimeRender _ = encode
76
77 ----------------------
78 type JSONAPI = Summary "Update List"
79 :> "lists"
80 :> Capture "listId" ListId
81 :> "add"
82 :> "form"
83 :> "async"
84 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
85
86 jsonApi :: GargServer JSONAPI
87 jsonApi = postAsync
88
89 ----------------------
90 type CSVAPI = Summary "Update List (legacy v3 CSV)"
91 :> "lists"
92 :> Capture "listId" ListId
93 :> "csv"
94 :> "add"
95 :> "form"
96 :> "async"
97 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
98
99 csvApi :: GargServer CSVAPI
100 csvApi = csvPostAsync
101
102 ------------------------------------------------------------------------
103 get :: HasNodeStory env err m =>
104 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
105 get lId = do
106 lst <- getNgramsList lId
107 let (NodeId id') = lId
108 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
109 , pack $ show id'
110 , ".json"
111 ]
112 ) lst
113
114 ------------------------------------------------------------------------
115 -- TODO : purge list
116 -- TODO talk
117 setList :: FlowCmdM env err m
118 => ListId
119 -> NgramsList
120 -> m Bool
121 setList l m = do
122 -- TODO check with Version for optim
123 printDebug "New list as file" l
124 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
125 -- TODO reindex
126 pure True
127
128 ------------------------------------------------------------------------
129 -- | Re-index documents of a corpus with new ngrams (called orphans here)
130 reIndexWith :: ( HasNodeStory env err m
131 , FlowCmdM env err m
132 )
133 => CorpusId
134 -> ListId
135 -> NgramsType
136 -> Set ListType
137 -> m ()
138 reIndexWith cId lId nt lts = do
139 -- Getting [NgramsTerm]
140 ts <- List.concat
141 <$> map (\(k,vs) -> k:vs)
142 <$> HashMap.toList
143 <$> getTermsWith identity [lId] nt lts
144
145 -- printDebug "ts" ts
146
147 -- Taking the ngrams with 0 occurrences only (orphans)
148 occs <- getOccByNgramsOnlyFast' cId lId nt ts
149
150 -- printDebug "occs" occs
151
152 let orphans = List.concat
153 $ map (\t -> case HashMap.lookup t occs of
154 Nothing -> [t]
155 Just n -> if n <= 1 then [t] else [ ]
156 ) ts
157
158 -- printDebug "orphans" orphans
159
160 -- Get all documents of the corpus
161 docs <- selectDocNodes cId
162 -- printDebug "docs length" (List.length docs)
163
164 -- Checking Text documents where orphans match
165 -- TODO Tests here
166 let
167 ngramsByDoc = map (HashMap.fromList)
168 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
169 $ map (\doc -> List.zip
170 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
171 $ Text.unlines $ catMaybes
172 [ doc ^. context_hyperdata . hd_title
173 , doc ^. context_hyperdata . hd_abstract
174 ]
175 )
176 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
177 ) docs
178
179 -- printDebug "ngramsByDoc" ngramsByDoc
180
181 -- Saving the indexation in database
182 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
183
184 pure () -- ngramsByDoc
185
186 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
187 toIndexedNgrams m t = Indexed <$> i <*> n
188 where
189 i = HashMap.lookup t m
190 n = Just (text2ngrams t)
191
192 ------------------------------------------------------------------------
193 type PostAPI = Summary "Update List"
194 :> "add"
195 :> "form"
196 :> "async"
197 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
198
199 postAsync :: GargServer JSONAPI
200 postAsync lId =
201 serveJobsAPI $
202 JobFunction (\f log' ->
203 let
204 log'' x = do
205 -- printDebug "postAsync ListId" x
206 liftBase $ log' x
207 in postAsync' lId f log'')
208
209 postAsync' :: FlowCmdM env err m
210 => ListId
211 -> WithFile
212 -> (JobLog -> m ())
213 -> m JobLog
214 postAsync' l (WithFile _ m _) logStatus = do
215
216 logStatus JobLog { _scst_succeeded = Just 0
217 , _scst_failed = Just 0
218 , _scst_remaining = Just 2
219 , _scst_events = Just []
220 }
221 printDebug "New list as file" l
222 _ <- setList l m
223 -- printDebug "Done" r
224
225 logStatus JobLog { _scst_succeeded = Just 1
226 , _scst_failed = Just 0
227 , _scst_remaining = Just 1
228 , _scst_events = Just []
229 }
230
231
232 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
233 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
234 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
235
236 pure JobLog { _scst_succeeded = Just 2
237 , _scst_failed = Just 0
238 , _scst_remaining = Just 0
239 , _scst_events = Just []
240 }
241
242
243 ------------------------------------------------------------------------
244 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
245 :> "csv"
246 :> "add"
247 :> "form"
248 :> "async"
249 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
250
251 readCsvText :: Text -> [(Text, Text, Text)]
252 readCsvText t = case eDec of
253 Left _ -> []
254 Right dec -> Vec.toList dec
255 where
256 lt = BSL.fromStrict $ P.encodeUtf8 t
257 eDec = Csv.decodeWith
258 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
259 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
260
261 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
262 parseCsvData lst = Map.fromList $ conv <$> lst
263 where
264 conv (status, label, forms) =
265 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
266 , _nre_list = case status == "map" of
267 True -> MapTerm
268 False -> case status == "main" of
269 True -> CandidateTerm
270 False -> StopTerm
271 , _nre_root = Nothing
272 , _nre_parent = Nothing
273 , _nre_children = MSet
274 $ Map.fromList
275 $ map (\form -> (NgramsTerm form, ()))
276 $ filter (/= "")
277 $ splitOn "|&|" forms
278 }
279 )
280
281 csvPost :: FlowCmdM env err m
282 => ListId
283 -> Text
284 -> m Bool
285 csvPost l m = do
286 printDebug "[csvPost] l" l
287 -- printDebug "[csvPost] m" m
288 -- status label forms
289 let lst = readCsvText m
290 let p = parseCsvData lst
291 --printDebug "[csvPost] lst" lst
292 printDebug "[csvPost] p" p
293 _ <- setListNgrams l NgramsTerms p
294 printDebug "ReIndexing List" l
295 corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
296 let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
297 _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
298
299 pure True
300
301 ------------------------------------------------------------------------
302 csvPostAsync :: GargServer CSVAPI
303 csvPostAsync lId =
304 serveJobsAPI $
305 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
306 let log'' x = do
307 printDebug "[csvPostAsync] filetype" ft
308 printDebug "[csvPostAsync] name" n
309 liftBase $ log' x
310 csvPostAsync' lId f log''
311
312
313 csvPostAsync' :: FlowCmdM env err m
314 => ListId
315 -> WithTextFile
316 -> (JobLog -> m ())
317 -> m JobLog
318 csvPostAsync' l (WithTextFile _ m _) logStatus = do
319 logStatus JobLog { _scst_succeeded = Just 0
320 , _scst_failed = Just 0
321 , _scst_remaining = Just 1
322 , _scst_events = Just []
323 }
324 _r <- csvPost l m
325
326 pure JobLog { _scst_succeeded = Just 1
327 , _scst_failed = Just 0
328 , _scst_remaining = Just 0
329 , _scst_events = Just []
330 }
331 ------------------------------------------------------------------------