]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[FEAT] reindexing enabled in frontend
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Aeson
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (toList, fromList)
22 import Data.Maybe (catMaybes)
23 import Data.Set (Set)
24 import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
25 import Data.Text (Text, concat, pack)
26 import GHC.Generics (Generic)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList, NgramsTerm(..))
31 import Gargantext.API.Node.Corpus.New.File (FileType(..))
32 import Gargantext.API.Prelude (GargServer, GargNoServer)
33 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
34 import Gargantext.Core.Types.Main (ListType(..))
35 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
36 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
37 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
38 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
39 import Gargantext.Database.Admin.Types.Hyperdata.Document
40 import Gargantext.Database.Admin.Types.Node
41 import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
42 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
43 import Gargantext.Database.Schema.Ngrams
44 import Gargantext.Database.Schema.Node
45 import Gargantext.Database.Types (Indexed(..))
46 import Gargantext.Prelude
47 import Network.HTTP.Media ((//), (/:))
48 import Servant
49 import Servant.Job.Async
50 import Servant.Job.Utils (jsonOptions)
51 import Web.FormUrlEncoded (FromForm)
52 import qualified Data.HashMap.Strict as HashMap
53 import qualified Data.List as List
54 import qualified Data.Map as Map
55 import qualified Data.Text as Text
56
57 ------------------------------------------------------------------------
58 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
59 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
60 :<|> PostAPI
61
62 api :: ListId -> GargServer API
63 api l = get l :<|> postAsync l
64
65 data HTML
66 instance Accept HTML where
67 contentType _ = "text" // "html" /: ("charset", "utf-8")
68 instance ToJSON a => MimeRender HTML a where
69 mimeRender _ = encode
70
71 ------------------------------------------------------------------------
72 get :: RepoCmdM env err m =>
73 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
74 get lId = do
75 lst <- get' lId
76 let (NodeId id') = lId
77 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
78 , pack $ show id'
79 , ".json"
80 ]
81 ) lst
82
83 get' :: RepoCmdM env err m
84 => ListId -> m NgramsList
85 get' lId = fromList
86 <$> zip ngramsTypes
87 <$> mapM (getNgramsTableMap lId) ngramsTypes
88
89 ------------------------------------------------------------------------
90 -- TODO : purge list
91 -- TODO talk
92 post :: FlowCmdM env err m
93 => ListId
94 -> NgramsList
95 -> m Bool
96 post l m = do
97 -- TODO check with Version for optim
98 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
99 -- TODO reindex
100 pure True
101
102
103 -----------------------------------------------------------------------------
104 -- | Re-index documents of a corpus with new ngrams (called orphans here)
105 reIndexWith :: CorpusId
106 -> ListId
107 -> NgramsType
108 -> Set ListType
109 -> GargNoServer ()
110 reIndexWith cId lId nt lts = do
111 -- Getting [NgramsTerm]
112 ts <- List.concat
113 <$> map (\(k,vs) -> k:vs)
114 <$> HashMap.toList
115 <$> getTermsWith identity [lId] nt lts
116
117 --printDebug "ts" ts
118
119 -- Taking the ngrams with 0 occurrences only (orphans)
120 occs <- getOccByNgramsOnlyFast' cId lId nt ts
121
122 let orphans = List.concat
123 $ map (\t -> case HashMap.lookup t occs of
124 Nothing -> [t]
125 Just n -> if n == 1 then [t] else [ ]
126 ) ts
127
128 -- Getting the Id of orphan ngrams
129 mapTextNgramsId <- insertNgrams (map (text2ngrams . unNgramsTerm) orphans)
130
131 printDebug "orphans" orphans
132
133 -- Get all documents of the corpus
134 docs <- selectDocNodes cId
135
136 printDebug "docs length" (List.length docs)
137
138 -- Checking Text documents where orphans match
139 -- TODO Tests here
140 let
141 ngramsByDoc = List.concat
142 $ map (\doc -> List.zip
143 (termsInText (buildPatterns $ map (\k -> ([unNgramsTerm k], [])) orphans)
144 $ Text.unlines $ catMaybes
145 [ doc ^. node_hyperdata . hd_title
146 , doc ^. node_hyperdata . hd_abstract
147 ]
148 )
149 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
150 ) docs
151
152 printDebug "ngramsByDoc" ngramsByDoc
153
154 -- Saving the indexation in database
155 _ <- insertDocNgrams lId ( HashMap.fromList
156 $ catMaybes
157 $ map (\(t,d) -> (,) <$> toIndexedNgrams mapTextNgramsId t
158 <*> Just d ) ngramsByDoc
159 )
160 pure ()
161
162 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
163 toIndexedNgrams m t = Indexed <$> i <*> n
164 where
165 i = HashMap.lookup t m
166 n = Just (text2ngrams t)
167
168 ------------------------------------------------------------------------
169 ------------------------------------------------------------------------
170 type PostAPI = Summary "Update List"
171 :> "add"
172 :> "form"
173 :> "async"
174 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
175
176 postAsync :: ListId -> GargServer PostAPI
177 postAsync lId =
178 serveJobsAPI $
179 JobFunction (\f log' -> postAsync' lId f (liftBase . log'))
180
181 postAsync' :: FlowCmdM env err m
182 => ListId
183 -> WithFile
184 -> (JobLog -> m ())
185 -> m JobLog
186 postAsync' l (WithFile _ m _) logStatus = do
187
188 logStatus JobLog { _scst_succeeded = Just 0
189 , _scst_failed = Just 0
190 , _scst_remaining = Just 1
191 , _scst_events = Just []
192 }
193 _r <- post l m
194
195 pure JobLog { _scst_succeeded = Just 1
196 , _scst_failed = Just 0
197 , _scst_remaining = Just 0
198 , _scst_events = Just []
199 }
200
201 data WithFile = WithFile
202 { _wf_filetype :: !FileType
203 , _wf_data :: !NgramsList
204 , _wf_name :: !Text
205 } deriving (Eq, Show, Generic)
206
207 makeLenses ''WithFile
208 instance FromForm WithFile
209 instance FromJSON WithFile where
210 parseJSON = genericParseJSON $ jsonOptions "_wf_"
211 instance ToJSON WithFile where
212 toJSON = genericToJSON $ jsonOptions "_wf_"
213 instance ToSchema WithFile where
214 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")