]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[list] CSV parsing draft
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE MonoLocalBinds #-}
13 {-# LANGUAGE TemplateHaskell #-}
14 {-# LANGUAGE TypeOperators #-}
15
16 module Gargantext.API.Ngrams.List
17 where
18
19 import Control.Lens hiding (elements, Indexed)
20 import Data.Aeson
21 import qualified Data.ByteString.Lazy as BSL
22 import qualified Data.Csv as Csv
23 import Data.Either (Either(..))
24 import Data.HashMap.Strict (HashMap)
25 import qualified Data.HashMap.Strict as HashMap
26 import qualified Data.List as List
27 import Data.Map (Map, toList, fromList)
28 import qualified Data.Map as Map
29 import Data.Maybe (catMaybes)
30 import Data.Set (Set)
31 import Data.Text (Text, concat, pack)
32 import qualified Data.Text as Text
33 import Data.Vector (Vector)
34 import qualified Data.Vector as Vec
35 import Network.HTTP.Media ((//), (/:))
36 import qualified Prelude as Prelude
37 import Servant
38 import Servant.Job.Async
39
40 import qualified Protolude as P
41
42 import Gargantext.API.Admin.Orchestrator.Types
43 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
44 import Gargantext.API.Ngrams.List.Types
45 import Gargantext.API.Ngrams.Tools (getTermsWith)
46 import Gargantext.API.Ngrams.Types
47 import Gargantext.API.Prelude (GargServer)
48 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
49 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
50 import Gargantext.Core.Types.Main (ListType(..))
51 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
54 import Gargantext.Database.Admin.Types.Hyperdata.Document
55 import Gargantext.Database.Admin.Types.Node
56 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
57 import Gargantext.Database.Schema.Ngrams
58 import Gargantext.Database.Schema.Node
59 import Gargantext.Database.Types (Indexed(..))
60 import Gargantext.Prelude
61
62
63 ------------------------------------------------------------------------
64 get :: RepoCmdM env err m =>
65 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
66 get lId = do
67 lst <- get' lId
68 let (NodeId id') = lId
69 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
70 , pack $ show id'
71 , ".json"
72 ]
73 ) lst
74
75 get' :: RepoCmdM env err m
76 => ListId -> m NgramsList
77 get' lId = fromList
78 <$> zip ngramsTypes
79 <$> mapM (getNgramsTableMap lId) ngramsTypes
80
81 ------------------------------------------------------------------------
82 -- TODO : purge list
83 -- TODO talk
84 post :: FlowCmdM env err m
85 => ListId
86 -> NgramsList
87 -> m Bool
88 post l m = do
89 -- TODO check with Version for optim
90 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
91 -- TODO reindex
92 pure True
93
94
95 -----------------------------------------------------------------------------
96 -- | Re-index documents of a corpus with new ngrams (called orphans here)
97 reIndexWith :: ( HasRepo env
98 , FlowCmdM env err m
99 )
100 => CorpusId
101 -> ListId
102 -> NgramsType
103 -> Set ListType
104 -> m ()
105 reIndexWith cId lId nt lts = do
106 -- Getting [NgramsTerm]
107 ts <- List.concat
108 <$> map (\(k,vs) -> k:vs)
109 <$> HashMap.toList
110 <$> getTermsWith identity [lId] nt lts
111
112 -- printDebug "ts" ts
113
114 -- Taking the ngrams with 0 occurrences only (orphans)
115 occs <- getOccByNgramsOnlyFast' cId lId nt ts
116
117 -- printDebug "occs" occs
118
119 let orphans = List.concat
120 $ map (\t -> case HashMap.lookup t occs of
121 Nothing -> [t]
122 Just n -> if n <= 1 then [t] else [ ]
123 ) ts
124
125 -- printDebug "orphans" orphans
126
127 -- Get all documents of the corpus
128 docs <- selectDocNodes cId
129 -- printDebug "docs length" (List.length docs)
130
131 -- Checking Text documents where orphans match
132 -- TODO Tests here
133 let
134 ngramsByDoc = map (HashMap.fromList)
135 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
136 $ map (\doc -> List.zip
137 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
138 $ Text.unlines $ catMaybes
139 [ doc ^. node_hyperdata . hd_title
140 , doc ^. node_hyperdata . hd_abstract
141 ]
142 )
143 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
144 ) docs
145
146 -- printDebug "ngramsByDoc" ngramsByDoc
147
148 -- Saving the indexation in database
149 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
150
151 pure () -- ngramsByDoc
152
153 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
154 toIndexedNgrams m t = Indexed <$> i <*> n
155 where
156 i = HashMap.lookup t m
157 n = Just (text2ngrams t)
158
159 ------------------------------------------------------------------------
160 type PostAPI = Summary "Update List"
161 :> "add"
162 :> "form"
163 :> "async"
164 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
165
166 postAsync :: ListId -> GargServer PostAPI
167 postAsync lId =
168 serveJobsAPI $
169 JobFunction (\f log' -> postAsync' lId f (liftBase . log'))
170
171 postAsync' :: FlowCmdM env err m
172 => ListId
173 -> WithFile
174 -> (JobLog -> m ())
175 -> m JobLog
176 postAsync' l (WithFile _ m _) logStatus = do
177
178 logStatus JobLog { _scst_succeeded = Just 0
179 , _scst_failed = Just 0
180 , _scst_remaining = Just 1
181 , _scst_events = Just []
182 }
183 _r <- post l m
184
185 pure JobLog { _scst_succeeded = Just 1
186 , _scst_failed = Just 0
187 , _scst_remaining = Just 0
188 , _scst_events = Just []
189 }
190 ------------------------------------------------------------------------
191 readCsvText :: Text -> [(Text, Text, Text)]
192 readCsvText t = case eDec of
193 Left _ -> []
194 Right dec -> Vec.toList dec
195 where
196 lt = BSL.fromStrict $ P.encodeUtf8 t
197 eDec = Csv.decodeWith
198 (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
199 Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))
200
201 parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
202 parseCsvData lst = Map.fromList $ conv <$> lst
203 where
204 conv (_status, label, _forms) =
205 (NgramsTerm label, NgramsRepoElement { _nre_size = 1
206 , _nre_list = CandidateTerm
207 , _nre_root = Nothing
208 , _nre_parent = Nothing
209 , _nre_children = MSet Map.empty })
210
211 csvPost :: FlowCmdM env err m
212 => ListId
213 -> Text
214 -> m Bool
215 csvPost l m = do
216 printDebug "[csvPost] l" l
217 -- printDebug "[csvPost] m" m
218 -- status label forms
219 let lst = readCsvText m
220 let p = parseCsvData lst
221 --printDebug "[csvPost] lst" lst
222 --printDebug "[csvPost] p" p
223 _ <- setListNgrams l NgramsTerms p
224 pure True
225 ------------------------------------------------------------------------
226 type CSVPostAPI = Summary "Update List (legacy v3 CSV)"
227 :> "csv"
228 :> "add"
229 :> "form"
230 :> "async"
231 :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
232
233 csvPostAsync :: ListId -> GargServer CSVPostAPI
234 csvPostAsync lId =
235 serveJobsAPI $
236 JobFunction $ \f@(WithTextFile ft _ n) log' -> do
237 let log'' x = do
238 printDebug "[csvPostAsync] filetype" ft
239 printDebug "[csvPostAsync] name" n
240 liftBase $ log' x
241 csvPostAsync' lId f log''
242
243 csvPostAsync' :: FlowCmdM env err m
244 => ListId
245 -> WithTextFile
246 -> (JobLog -> m ())
247 -> m JobLog
248 csvPostAsync' l (WithTextFile _ m _) logStatus = do
249 logStatus JobLog { _scst_succeeded = Just 0
250 , _scst_failed = Just 0
251 , _scst_remaining = Just 1
252 , _scst_events = Just []
253 }
254 _r <- csvPost l m
255
256 pure JobLog { _scst_succeeded = Just 1
257 , _scst_failed = Just 0
258 , _scst_remaining = Just 0
259 , _scst_events = Just []
260 }
261
262 ------------------------------------------------------------------------
263 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
264 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
265 :<|> PostAPI
266 :<|> CSVPostAPI
267
268 api :: ListId -> GargServer API
269 api l = get l :<|> postAsync l :<|> csvPostAsync l
270
271 data HTML
272 instance Accept HTML where
273 contentType _ = "text" // "html" /: ("charset", "utf-8")
274 instance ToJSON a => MimeRender HTML a where
275 mimeRender _ = encode