2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (toList, fromList)
22 import Data.Maybe (catMaybes)
24 import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
25 import Data.Text (Text, concat, pack)
26 import GHC.Generics (Generic)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types
31 import Gargantext.API.Node.Corpus.New.File (FileType(..))
32 import Gargantext.API.Prelude (GargServer)
33 import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
34 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
35 import Gargantext.Core.Types.Main (ListType(..))
36 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
37 import Gargantext.Database.Action.Flow (saveDocNgramsWith)
38 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
39 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
40 import Gargantext.Database.Admin.Types.Hyperdata.Document
41 import Gargantext.Database.Admin.Types.Node
42 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
43 import Gargantext.Database.Schema.Ngrams
44 import Gargantext.Database.Schema.Node
45 import Gargantext.Database.Types (Indexed(..))
46 import Gargantext.Prelude
47 import Network.HTTP.Media ((//), (/:))
49 import Servant.Job.Async
50 import Servant.Job.Utils (jsonOptions)
51 import Web.FormUrlEncoded (FromForm)
52 import qualified Data.HashMap.Strict as HashMap
53 import qualified Data.List as List
54 import qualified Data.Map as Map
55 import qualified Data.Text as Text
57 ------------------------------------------------------------------------
58 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
59 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
62 api :: ListId -> GargServer API
63 api l = get l :<|> postAsync l
66 instance Accept HTML where
67 contentType _ = "text" // "html" /: ("charset", "utf-8")
68 instance ToJSON a => MimeRender HTML a where
71 ------------------------------------------------------------------------
72 get :: RepoCmdM env err m =>
73 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
76 let (NodeId id') = lId
77 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
83 get' :: RepoCmdM env err m
84 => ListId -> m NgramsList
87 <$> mapM (getNgramsTableMap lId) ngramsTypes
89 ------------------------------------------------------------------------
92 post :: FlowCmdM env err m
97 -- TODO check with Version for optim
98 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
103 -----------------------------------------------------------------------------
104 -- | Re-index documents of a corpus with new ngrams (called orphans here)
105 reIndexWith :: ( HasRepo env
113 reIndexWith cId lId nt lts = do
114 -- Getting [NgramsTerm]
116 <$> map (\(k,vs) -> k:vs)
118 <$> getTermsWith identity [lId] nt lts
120 -- printDebug "ts" ts
122 -- Taking the ngrams with 0 occurrences only (orphans)
123 occs <- getOccByNgramsOnlyFast' cId lId nt ts
125 -- printDebug "occs" occs
127 let orphans = List.concat
128 $ map (\t -> case HashMap.lookup t occs of
130 Just n -> if n <= 1 then [t] else [ ]
133 -- printDebug "orphans" orphans
135 -- Get all documents of the corpus
136 docs <- selectDocNodes cId
137 -- printDebug "docs length" (List.length docs)
139 -- Checking Text documents where orphans match
142 ngramsByDoc = map (HashMap.fromList)
143 $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
144 $ map (\doc -> List.zip
145 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
146 $ Text.unlines $ catMaybes
147 [ doc ^. node_hyperdata . hd_title
148 , doc ^. node_hyperdata . hd_abstract
151 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
154 -- printDebug "ngramsByDoc" ngramsByDoc
156 -- Saving the indexation in database
157 _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
159 pure () -- ngramsByDoc
161 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
162 toIndexedNgrams m t = Indexed <$> i <*> n
164 i = HashMap.lookup t m
165 n = Just (text2ngrams t)
167 ------------------------------------------------------------------------
168 ------------------------------------------------------------------------
169 type PostAPI = Summary "Update List"
173 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
175 postAsync :: ListId -> GargServer PostAPI
178 JobFunction (\f log' -> postAsync' lId f (liftBase . log'))
180 postAsync' :: FlowCmdM env err m
185 postAsync' l (WithFile _ m _) logStatus = do
187 logStatus JobLog { _scst_succeeded = Just 0
188 , _scst_failed = Just 0
189 , _scst_remaining = Just 1
190 , _scst_events = Just []
194 pure JobLog { _scst_succeeded = Just 1
195 , _scst_failed = Just 0
196 , _scst_remaining = Just 0
197 , _scst_events = Just []
200 data WithFile = WithFile
201 { _wf_filetype :: !FileType
202 , _wf_data :: !NgramsList
204 } deriving (Eq, Show, Generic)
206 makeLenses ''WithFile
207 instance FromForm WithFile
208 instance FromJSON WithFile where
209 parseJSON = genericParseJSON $ jsonOptions "_wf_"
210 instance ToJSON WithFile where
211 toJSON = genericToJSON $ jsonOptions "_wf_"
212 instance ToSchema WithFile where
213 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")