]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Ngrams/List.hs
[FIX] multi-term reindex
[gargantext.git] / src / Gargantext / API / Ngrams / List.hs
1 {-|
2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
14
15 module Gargantext.API.Ngrams.List
16 where
17
18 import Control.Lens hiding (elements, Indexed)
19 import Data.Aeson
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (toList, fromList)
22 import Data.Maybe (catMaybes)
23 import Data.Set (Set)
24 import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
25 import Data.Text (Text, concat, pack)
26 import GHC.Generics (Generic)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types
31 import Gargantext.API.Node.Corpus.New.File (FileType(..))
32 import Gargantext.API.Prelude (GargServer)
33 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
34 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
35 import Gargantext.Core.Types.Main (ListType(..))
36 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
37 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
38 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
39 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
40 import Gargantext.Database.Admin.Types.Hyperdata.Document
41 import Gargantext.Database.Admin.Types.Node
42 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
43 import Gargantext.Database.Schema.Ngrams
44 import Gargantext.Database.Schema.Node
45 import Gargantext.Database.Types (Indexed(..))
46 import Gargantext.Prelude
47 import Network.HTTP.Media ((//), (/:))
48 import Servant
49 import Servant.Job.Async
50 import Servant.Job.Utils (jsonOptions)
51 import Web.FormUrlEncoded (FromForm)
52 import qualified Data.HashMap.Strict as HashMap
53 import qualified Data.List as List
54 import qualified Data.Map as Map
55 import qualified Data.Text as Text
56
57 ------------------------------------------------------------------------
58 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
59 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
60 :<|> PostAPI
61
62 api :: ListId -> GargServer API
63 api l = get l :<|> postAsync l
64
65 data HTML
66 instance Accept HTML where
67 contentType _ = "text" // "html" /: ("charset", "utf-8")
68 instance ToJSON a => MimeRender HTML a where
69 mimeRender _ = encode
70
71 ------------------------------------------------------------------------
72 get :: RepoCmdM env err m =>
73 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
74 get lId = do
75 lst <- get' lId
76 let (NodeId id') = lId
77 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
78 , pack $ show id'
79 , ".json"
80 ]
81 ) lst
82
83 get' :: RepoCmdM env err m
84 => ListId -> m NgramsList
85 get' lId = fromList
86 <$> zip ngramsTypes
87 <$> mapM (getNgramsTableMap lId) ngramsTypes
88
89 ------------------------------------------------------------------------
90 -- TODO : purge list
91 -- TODO talk
92 post :: FlowCmdM env err m
93 => ListId
94 -> NgramsList
95 -> m Bool
96 post l m = do
97 -- TODO check with Version for optim
98 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
99 -- TODO reindex
100 pure True
101
102
103 -----------------------------------------------------------------------------
104 -- | Re-index documents of a corpus with new ngrams (called orphans here)
105 reIndexWith :: ( HasRepo env
106 , FlowCmdM env err m
107 )
108 => CorpusId
109 -> ListId
110 -> NgramsType
111 -> Set ListType
112 -> m ()
113 reIndexWith cId lId nt lts = do
114 -- Getting [NgramsTerm]
115 ts <- List.concat
116 <$> map (\(k,vs) -> k:vs)
117 <$> HashMap.toList
118 <$> getTermsWith identity [lId] nt lts
119
120 -- printDebug "ts" ts
121
122 -- Taking the ngrams with 0 occurrences only (orphans)
123 occs <- getOccByNgramsOnlyFast' cId lId nt ts
124
125 -- printDebug "occs" occs
126
127 let orphans = List.concat
128 $ map (\t -> case HashMap.lookup t occs of
129 Nothing -> [t]
130 Just n -> if n <= 1 then [t] else [ ]
131 ) ts
132
133 -- printDebug "orphans" orphans
134
135 -- Get all documents of the corpus
136 docs <- selectDocNodes cId
137 -- printDebug "docs length" (List.length docs)
138
139 -- Checking Text documents where orphans match
140 -- TODO Tests here
141 let
142 ngramsByDoc = map (HashMap.fromList)
143 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
144 $ map (\doc -> List.zip
145 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
146 $ Text.unlines $ catMaybes
147 [ doc ^. node_hyperdata . hd_title
148 , doc ^. node_hyperdata . hd_abstract
149 ]
150 )
151 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
152 ) docs
153
154 -- printDebug "ngramsByDoc" ngramsByDoc
155
156 -- Saving the indexation in database
157 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
158
159 pure () -- ngramsByDoc
160
161 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
162 toIndexedNgrams m t = Indexed <$> i <*> n
163 where
164 i = HashMap.lookup t m
165 n = Just (text2ngrams t)
166
167 ------------------------------------------------------------------------
168 ------------------------------------------------------------------------
169 type PostAPI = Summary "Update List"
170 :> "add"
171 :> "form"
172 :> "async"
173 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
174
175 postAsync :: ListId -> GargServer PostAPI
176 postAsync lId =
177 serveJobsAPI $
178 JobFunction (\f log' -> postAsync' lId f (liftBase . log'))
179
180 postAsync' :: FlowCmdM env err m
181 => ListId
182 -> WithFile
183 -> (JobLog -> m ())
184 -> m JobLog
185 postAsync' l (WithFile _ m _) logStatus = do
186
187 logStatus JobLog { _scst_succeeded = Just 0
188 , _scst_failed = Just 0
189 , _scst_remaining = Just 1
190 , _scst_events = Just []
191 }
192 _r <- post l m
193
194 pure JobLog { _scst_succeeded = Just 1
195 , _scst_failed = Just 0
196 , _scst_remaining = Just 0
197 , _scst_events = Just []
198 }
199
200 data WithFile = WithFile
201 { _wf_filetype :: !FileType
202 , _wf_data :: !NgramsList
203 , _wf_name :: !Text
204 } deriving (Eq, Show, Generic)
205
206 makeLenses ''WithFile
207 instance FromForm WithFile
208 instance FromJSON WithFile where
209 parseJSON = genericParseJSON $ jsonOptions "_wf_"
210 instance ToJSON WithFile where
211 toJSON = genericToJSON $ jsonOptions "_wf_"
212 instance ToSchema WithFile where
213 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")