{-| Module : Gargantext.Text.Search Description : All parsers of Gargantext in one file. Copyright : (c) CNRS, 2017 - present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX This search Engine is first made to clean CSV file according to a query. Starting from this model, a specific Gargantext engine will be made (using more metrics scores/features). -} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings, NamedFieldPuns #-} module Gargantext.Text.Search where import Data.SearchEngine import Data.Ix -- Usefull to use stopwords -- import Data.Set (Set) -- import qualified Data.Set as Set import Data.Text (Text) import Gargantext.Prelude import Gargantext.Text.Terms.Mono (monoTexts) import Gargantext.Text.Terms.Mono.Stem as ST import Gargantext.Text.Parsers.CSV type DocId = Int type DocSearchEngine = SearchEngine Doc DocId DocField NoFeatures data DocField = TitleField | AbstractField deriving (Eq, Ord, Enum, Bounded, Ix, Show) initialDocSearchEngine :: DocSearchEngine initialDocSearchEngine = initSearchEngine docSearchConfig defaultSearchRankParameters docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures docSearchConfig = SearchConfig { documentKey = d_docId, extractDocumentTerms = extractTerms, transformQueryTerm = normaliseQueryToken, documentFeatureValue = const noFeatures } where extractTerms :: Doc -> DocField -> [Text] extractTerms doc TitleField = monoTexts (d_title doc) extractTerms doc AbstractField = monoTexts (d_abstract doc) normaliseQueryToken :: Text -> DocField -> Text normaliseQueryToken tok = let tokStem = ST.stem ST.EN in \field -> case field of TitleField -> tokStem tok AbstractField -> tokStem tok defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures defaultSearchRankParameters = SearchRankParameters { paramK1, paramB, paramFieldWeights, paramFeatureWeights = noFeatures, paramFeatureFunctions = noFeatures, paramResultsetSoftLimit = 2000, paramResultsetHardLimit = 4000, paramAutosuggestPrefilterLimit = 500, paramAutosuggestPostfilterLimit = 500 } where paramK1 :: Float paramK1 = 1.5 paramB :: DocField -> Float paramB TitleField = 0.9 paramB AbstractField = 0.5 paramFieldWeights :: DocField -> Float paramFieldWeights TitleField = 20 paramFieldWeights AbstractField = 5