2 Module : Gargantext.API.Ngrams.List
3 Description : Get Ngrams (lists)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE TypeOperators #-}
15 module Gargantext.API.Ngrams.List
18 import Control.Lens hiding (elements, Indexed)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (toList, fromList)
22 import Data.Maybe (catMaybes)
24 import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
25 import Data.Text (Text, concat, pack)
26 import GHC.Generics (Generic)
27 import Gargantext.API.Admin.Orchestrator.Types
28 import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
29 import Gargantext.API.Ngrams.Tools (getTermsWith)
30 import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList, NgramsTerm(..))
31 import Gargantext.API.Node.Corpus.New.File (FileType(..))
32 import Gargantext.API.Prelude (GargServer, GargNoServer)
33 import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
34 import Gargantext.Core.Types.Main (ListType(..))
35 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
36 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
37 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
38 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
39 import Gargantext.Database.Admin.Types.Hyperdata.Document
40 import Gargantext.Database.Admin.Types.Node
41 import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
42 import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
43 import Gargantext.Database.Schema.Ngrams
44 import Gargantext.Database.Schema.Node
45 import Gargantext.Database.Types (Indexed(..))
46 import Gargantext.Prelude
47 import Network.HTTP.Media ((//), (/:))
49 import Servant.Job.Async
50 import Servant.Job.Utils (jsonOptions)
51 import Web.FormUrlEncoded (FromForm)
52 import qualified Data.HashMap.Strict as HashMap
53 import qualified Data.List as List
54 import qualified Data.Map as Map
55 import qualified Data.Text as Text
57 ------------------------------------------------------------------------
58 type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
59 -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
62 api :: ListId -> GargServer API
63 api l = get l :<|> postAsync l
66 instance Accept HTML where
67 contentType _ = "text" // "html" /: ("charset", "utf-8")
68 instance ToJSON a => MimeRender HTML a where
71 ------------------------------------------------------------------------
72 get :: RepoCmdM env err m =>
73 ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
76 let (NodeId id') = lId
77 return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
83 get' :: RepoCmdM env err m
84 => ListId -> m NgramsList
87 <$> mapM (getNgramsTableMap lId) ngramsTypes
89 ------------------------------------------------------------------------
92 post :: FlowCmdM env err m
97 -- TODO check with Version for optim
98 _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
103 -----------------------------------------------------------------------------
104 -- | Re-index documents of a corpus with new ngrams (called orphans here)
105 reIndexWith :: CorpusId
110 reIndexWith cId lId nt lts = do
111 -- Getting [NgramsTerm]
113 <$> map (\(k,vs) -> k:vs)
115 <$> getTermsWith identity [lId] nt lts
119 -- Taking the ngrams with 0 occurrences only (orphans)
120 occs <- getOccByNgramsOnlyFast' cId lId nt ts
122 let orphans = List.concat
123 $ map (\t -> case HashMap.lookup t occs of
125 Just n -> if n == 1 then [t] else [ ]
128 -- Getting the Id of orphan ngrams
129 mapTextNgramsId <- insertNgrams (map (text2ngrams . unNgramsTerm) orphans)
131 printDebug "orphans" orphans
133 -- Get all documents of the corpus
134 docs <- selectDocNodes cId
136 printDebug "docs length" (List.length docs)
138 -- Checking Text documents where orphans match
141 ngramsByDoc = List.concat
142 $ map (\doc -> List.zip
143 (termsInText (buildPatterns $ map (\k -> ([unNgramsTerm k], [])) orphans)
144 $ Text.unlines $ catMaybes
145 [ doc ^. node_hyperdata . hd_title
146 , doc ^. node_hyperdata . hd_abstract
149 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
152 printDebug "ngramsByDoc" ngramsByDoc
154 -- Saving the indexation in database
155 _ <- insertDocNgrams lId ( HashMap.fromList
157 $ map (\(t,d) -> (,) <$> toIndexedNgrams mapTextNgramsId t
158 <*> Just d ) ngramsByDoc
162 toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
163 toIndexedNgrams m t = Indexed <$> i <*> n
165 i = HashMap.lookup t m
166 n = Just (text2ngrams t)
168 ------------------------------------------------------------------------
169 ------------------------------------------------------------------------
170 type PostAPI = Summary "Update List"
174 :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
176 postAsync :: ListId -> GargServer PostAPI
179 JobFunction (\f log' -> postAsync' lId f (liftBase . log'))
181 postAsync' :: FlowCmdM env err m
186 postAsync' l (WithFile _ m _) logStatus = do
188 logStatus JobLog { _scst_succeeded = Just 0
189 , _scst_failed = Just 0
190 , _scst_remaining = Just 1
191 , _scst_events = Just []
195 pure JobLog { _scst_succeeded = Just 1
196 , _scst_failed = Just 0
197 , _scst_remaining = Just 0
198 , _scst_events = Just []
201 data WithFile = WithFile
202 { _wf_filetype :: !FileType
203 , _wf_data :: !NgramsList
205 } deriving (Eq, Show, Generic)
207 makeLenses ''WithFile
208 instance FromForm WithFile
209 instance FromJSON WithFile where
210 parseJSON = genericParseJSON $ jsonOptions "_wf_"
211 instance ToJSON WithFile where
212 toJSON = genericToJSON $ jsonOptions "_wf_"
213 instance ToSchema WithFile where
214 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")