]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Search.hs
[NLP] add support for arbitrary languages in INI file
[gargantext.git] / src / Gargantext / Core / Text / Search.hs
{-|
Module      : Gargantext.Core.Text.Search
Description : Search engine configuration for CSV documents (title and abstract).
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

This search engine was first made to clean a CSV file according to a query.

Starting from this model, a specific Gargantext engine will be made
(using more metric scores/features).
-}
15
16 module Gargantext.Core.Text.Search where
17
18 import Data.SearchEngine
19
20 import Data.Ix
21
-- Useful for handling stop words (currently unused)
23 -- import Data.Set (Set)
24 -- import qualified Data.Set as Set
25 import Data.Text (Text)
26
27 import Gargantext.Prelude
28 import Gargantext.Core.Text.Terms.Mono (monoTexts)
29 import Gargantext.Core.Text.Terms.Mono.Stem as ST
30 import Gargantext.Core.Text.Corpus.Parsers.CSV
31
-- | Key type under which documents are stored in the 'DocSearchEngine'.
type DocId = Int
33
-- | The search engine specialised to this module's types: documents are
-- 'CsvGargV3' records keyed by 'DocId', the searchable fields are the
-- 'DocField's, and no extra features are used ('NoFeatures').
type DocSearchEngine =
  SearchEngine CsvGargV3 DocId DocField NoFeatures
39
-- | The document fields known to the search engine.
data DocField
  = TitleField    -- ^ The title of a document.
  | AbstractField -- ^ The abstract of a document.
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)
43
-- | The starting 'DocSearchEngine', obtained by initialising the engine
-- with 'docSearchConfig' and 'defaultSearchRankParameters'.
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine = initSearchEngine config rankParams
  where
    config     = docSearchConfig
    rankParams = defaultSearchRankParameters
47
-- | How the search engine handles 'CsvGargV3' documents:
--
-- * a document is identified by its 'd_docId';
-- * each 'DocField' contributes the terms that 'monoTexts' extracts from
--   the matching document text (title or abstract);
-- * every query term is stemmed with the English stemmer;
-- * no document features are used.
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig =
  SearchConfig
    { documentKey          = d_docId
    , extractDocumentTerms = termsOf
    , transformQueryTerm   = normaliseQuery
    , documentFeatureValue = const noFeatures
    }
  where
    -- Terms of a document for one field: pick the field's text, then split
    -- it with 'monoTexts'.
    -- NOTE(review): no stemming is applied here, while query terms are
    -- stemmed below; unless 'monoTexts' stems internally, stemmed query
    -- terms may fail to match the indexed terms -- confirm.
    termsOf :: CsvGargV3 -> DocField -> [Text]
    termsOf doc field = monoTexts $ case field of
      TitleField    -> d_title doc
      AbstractField -> d_abstract doc

    -- Query-term normalisation, given per field; both fields currently
    -- share the same treatment.
    normaliseQuery :: Text -> DocField -> Text
    normaliseQuery token TitleField    = stemEnglish token
    normaliseQuery token AbstractField = stemEnglish token

    stemEnglish :: Text -> Text
    stemEnglish = ST.stem ST.EN
67
-- | Default ranking parameters for the document search engine.
--
-- Title matches weigh more than abstract matches (20 against 5), no
-- feature weights or functions are used, and result sets are limited to
-- 2000 (soft) / 4000 (hard) entries, with both autosuggest limits at 500.
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
  SearchRankParameters
    { paramK1                         = k1
    , paramB                          = bFor
    , paramFieldWeights               = weightFor
    , paramFeatureWeights             = noFeatures
    , paramFeatureFunctions           = noFeatures
    , paramResultsetSoftLimit         = 2000
    , paramResultsetHardLimit         = 4000
    , paramAutosuggestPrefilterLimit  = 500
    , paramAutosuggestPostfilterLimit = 500
    }
  where
    -- NOTE(review): k1 and b are presumably the BM25 ranking parameters
    -- of the search library -- confirm against its documentation.
    k1 :: Float
    k1 = 1.5

    -- Per-field value for 'paramB'.
    bFor :: DocField -> Float
    bFor TitleField    = 0.9
    bFor AbstractField = 0.5

    -- Per-field weight: the title counts four times as much as the abstract.
    weightFor :: DocField -> Float
    weightFor TitleField    = 20
    weightFor AbstractField = 5
92