{-|
Module      : Gargantext.API.Ngrams.List
Description : Get Ngrams (lists)
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}

module Gargantext.API.Ngrams.List
  where

import Control.Lens hiding (elements, Indexed)
import Data.Either (Either(..))
import Data.HashMap.Strict (HashMap)
import Data.Map.Strict (Map, toList)
import Data.Maybe (catMaybes, fromMaybe)
import Data.Set (Set)
import Data.Text (Text, concat, pack, splitOn)
import Data.Vector (Vector)
import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (setListNgrams)
import Gargantext.API.Ngrams.List.Types
import Gargantext.API.Ngrams.Prelude (getNgramsList)
import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types
import Gargantext.API.Prelude (GargServer, GargM, GargError)
import Gargantext.API.Types
import Gargantext.Core.NodeStory
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatterns, termsInText)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.Action.Flow (saveDocNgramsWith)
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
-- import Gargantext.Database.Action.Metrics.NgramsByContext (refreshNgramsMaterialized)
import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getNode)
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Schema.Context
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Node (_node_parent_id)
import Gargantext.Database.Types (Indexed(..))
import Gargantext.Prelude
import Gargantext.Utils.Jobs (serveJobsAPI, MonadJobStatus(..))
import Servant
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Csv as Csv
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.Vector as Vec
import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
import qualified Gargantext.Utils.Servant as GUS
import qualified Prelude
import qualified Protolude as P
------------------------------------------------------------------------
type GETAPI = Summary "Get List"
              :> "lists"
              :> Capture "listId" ListId
              :> "json"
              :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
         :<|> "lists"
              :> Capture "listId" ListId
              :> "csv"
              :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
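-- Routes served by 'GETAPI' above, as rendered by servant (illustrative):
--
--   GET /lists/<listId>/json   -- the whole 'NgramsList', with a download header
--   GET /lists/<listId>/csv    -- only the 'NgramsTerms' table, as CSV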
getApi :: GargServer GETAPI
getApi = getJson :<|> getCsv

----------------------
type JSONAPI = Summary "Update List"
               :> "lists"
               :> Capture "listId" ListId
               :> "add"
               :> "form"
               :> "async"
               :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog

jsonApi :: ServerT JSONAPI (GargM Env GargError)
jsonApi = jsonPostAsync

----------------------
type CSVAPI = Summary "Update List (legacy v3 CSV)"
              :> "lists"
              :> Capture "listId" ListId
              :> "csv"
              :> "add"
              :> "form"
              :> "async"
              :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog

csvApi :: ServerT CSVAPI (GargM Env GargError)
csvApi = csvPostAsync

------------------------------------------------------------------------
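-- | Serve a list as JSON. The @Content-Disposition@ header makes browsers
--   download it as @GarganText_NgramsList-<listId>.json@.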
getJson :: HasNodeStory env err m =>
           ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
getJson lId = do
  lst <- getNgramsList lId
  let (NodeId id') = lId
  return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
                             , pack $ show id'
                             , ".json"
                             ]
                     ) lst

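-- | Serve only the 'NgramsTerms' table of a list as CSV; if the list has no
--   such table, an empty map is returned without the download header.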
getCsv :: HasNodeStory env err m =>
          ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
getCsv lId = do
  lst <- getNgramsList lId
  let (NodeId id') = lId
  return $ case Map.lookup TableNgrams.NgramsTerms lst of
    Nothing -> noHeader Map.empty
    Just (Versioned { _v_data }) ->
      addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
                        , pack $ show id'
                        , ".csv"
                        ]
                ) _v_data

------------------------------------------------------------------------
-- TODO : purge list
-- TODO talk
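-- | Overwrite every table of a list with the given 'NgramsList'
--   (one 'setListNgrams' call per ngrams type).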
setList :: FlowCmdM env err m
        => ListId
        -> NgramsList
        -> m Bool
setList l m = do
  -- TODO check with Version for optim
  -- printDebug "New list as file" l
  _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
  -- TODO reindex
  pure True

------------------------------------------------------------------------
-- | Re-index documents of a corpus with new ngrams (called orphans here)
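--
--   The steps are: fetch the selected terms of the list, fetch every document
--   of the corpus, match those terms in each document's title and abstract,
--   then save the resulting counts with 'saveDocNgramsWith'.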
reIndexWith :: ( HasNodeStory env err m
               , FlowCmdM env err m
               )
            => CorpusId
            -> ListId
            -> NgramsType
            -> Set ListType
            -> m ()
reIndexWith cId lId nt lts = do
  -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)

  -- Getting [NgramsTerm]
  ts <- List.concat
    <$> map (\(k,vs) -> k:vs)
    <$> HashMap.toList
    <$> getTermsWith identity [lId] nt lts

  -- Get all documents of the corpus
  docs <- selectDocNodes cId

  let
    -- fromListWith (<>)
    ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
                $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
                $ map (docNgrams nt ts) docs

  -- printDebug "ngramsByDoc: " ngramsByDoc

  -- Saving the indexation in database
  _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
  -- _ <- refreshNgramsMaterialized
  pure ()

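-- | Match the given terms in a document's title and abstract, pairing each
--   match (with its count) against a singleton @nt -> (docId -> 1)@ map.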
docNgrams :: NgramsType
          -> [NgramsTerm]
          -> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument
          -> [((MatchedText, TermsCount),
               Map NgramsType (Map NodeId Int))]
docNgrams nt ts doc =
  List.zip
    (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
      $ Text.unlines $ catMaybes
        [ doc ^. context_hyperdata . hd_title
        , doc ^. context_hyperdata . hd_abstract
        ]
    )
    (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])

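-- | Look a term up in the ngrams index; 'Nothing' when the term has no id yet.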
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
  where
    i = HashMap.lookup t m
    n = Just (text2ngrams t)

------------------------------------------------------------------------
jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
jsonPostAsync lId =
  serveJobsAPI UpdateNgramsListJobJSON $ \jHandle f ->
    postAsync' lId f jHandle

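-- | Body of the JSON upload job: overwrite the list, then re-index the parent
--   corpus on its map and candidate terms, reporting progress on the 'JobHandle'.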
postAsync' :: (FlowCmdM env err m, MonadJobStatus m)
           => ListId
           -> WithJsonFile
           -> JobHandle m
           -> m ()
postAsync' l (WithJsonFile m _) jobHandle = do

  markStarted 2 jobHandle
  -- printDebug "New list as file" l
  _ <- setList l m
  -- printDebug "Done" r

  markProgress 1 jobHandle

  corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
  let corpus_id = fromMaybe (panic "[postAsync'] list has no parent corpus") (_node_parent_id corpus_node)
  _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])

  markComplete jobHandle

------------------------------------------------------------------------

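-- | Decode a tab-separated payload into @(status, label, forms)@ rows;
--   'Csv.HasHeader' means the first row is skipped as a header. An
--   illustrative row (columns separated by tab characters):
--
--   > map <TAB> abelian group <TAB> abelian groups|&|groupes abeliens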
readCsvText :: Text -> Either Text [(Text, Text, Text)]
readCsvText t = case eDec of
    Left err  -> Left $ pack err
    Right dec -> Right $ Vec.toList dec
  where
    lt = BSL.fromStrict $ P.encodeUtf8 t
    eDec = Csv.decodeWith
             (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
             Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))

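-- | Turn parsed rows into repo elements: status @"map"@ becomes 'MapTerm',
--   @"main"@ becomes 'CandidateTerm', anything else 'StopTerm'; @forms@ is a
--   @|&|@-separated list of children (empty forms and the label itself are
--   dropped).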
parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
parseCsvData lst = Map.fromList $ conv <$> lst
  where
    conv (status, label, forms) =
      (NgramsTerm label, NgramsRepoElement { _nre_size = 1
                                           , _nre_list = case status of
                                               "map"  -> MapTerm
                                               "main" -> CandidateTerm
                                               _      -> StopTerm
                                           , _nre_root = Nothing
                                           , _nre_parent = Nothing
                                           , _nre_children = MSet
                                                           $ Map.fromList
                                                           $ map (\form -> (NgramsTerm form, ()))
                                                           $ filter (\w -> w /= "" && w /= label)
                                                           $ splitOn "|&|" forms
                                           }
      )

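-- | Parse a CSV/TSV payload, overwrite the 'NgramsTerms' table of the list
--   with it, and re-index the parent corpus; returns 'Left' with the parse
--   error when decoding fails.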
csvPost :: FlowCmdM env err m
        => ListId
        -> Text
        -> m (Either Text ())
csvPost l m = do
  -- printDebug "[csvPost] l" l
  -- printDebug "[csvPost] m" m
  -- status label forms
  let eLst = readCsvText m
  case eLst of
    Left err  -> pure $ Left err
    Right lst -> do
      let p = parseCsvData lst
      -- printDebug "[csvPost] lst" lst
      -- printDebug "[csvPost] p" p
      _ <- setListNgrams l NgramsTerms p
      -- printDebug "ReIndexing List" l
      corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
      let corpus_id = fromMaybe (panic "[csvPost] list has no parent corpus") (_node_parent_id corpus_node)
      _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])

      pure $ Right ()

------------------------------------------------------------------------
csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
csvPostAsync lId =
  serveJobsAPI UpdateNgramsListJobCSV $ \jHandle f -> do
    markStarted 1 jHandle
    ePost <- csvPost lId (_wtf_data f)
    case ePost of
      Left err -> markFailed (Just err) jHandle
      Right () -> markComplete jHandle

    getLatestJobStatus jHandle >>= printDebug "[csvPostAsync] job ended with joblog: "

------------------------------------------------------------------------


-- | This is for debugging the CSV parser in the REPL
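--
--   Illustrative REPL usage (the runner is an assumption, use whichever @Cmd@
--   runner your dev environment provides; the list id and path are placeholders):
--
--   > runCmdReplEasy $ importCsvFile (NodeId 123) "/path/to/list.tsv"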
importCsvFile :: FlowCmdM env err m
              => ListId -> P.FilePath -> m (Either Text ())
importCsvFile lId fp = do
  contents <- liftBase $ P.readFile fp
  csvPost lId contents