]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Search.hs
Merge branch 'dev' into dev-doc-table-optimization
[gargantext.git] / src / Gargantext / Text / Search.hs
1 {-|
2 Module : Gargantext.Text.Search
3 Description : Simple full-text search engine over CSV documents (title and abstract).
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This search engine was first written to clean a CSV file according to a query.
11
12 Starting from this model, a dedicated Gargantext engine will be built
13 (using more scoring metrics and features).
14 -}
15
16 {-# LANGUAGE NamedFieldPuns #-}
17
18 module Gargantext.Text.Search where
19
20 import Data.SearchEngine
21
22 import Data.Ix
23
24 -- Useful for stopwords
25 -- import Data.Set (Set)
26 -- import qualified Data.Set as Set
27 import Data.Text (Text)
28
29 import Gargantext.Prelude
30 import Gargantext.Text.Terms.Mono (monoTexts)
31 import Gargantext.Text.Terms.Mono.Stem as ST
32 import Gargantext.Text.Corpus.Parsers.CSV
33
-- | Identifier of a document; used as the key type of 'DocSearchEngine'.
type DocId = Int
35
-- | Search engine specialised to Gargantext CSV documents: it holds
-- 'CsvGargV3' documents keyed by 'DocId', searches the fields enumerated by
-- 'DocField', and uses no additional ranking features ('NoFeatures').
type DocSearchEngine = SearchEngine CsvGargV3 DocId DocField NoFeatures
41
-- | Fields of a document that are indexed and can be searched.
data DocField
  = TitleField     -- ^ the title of the document
  | AbstractField  -- ^ the abstract of the document
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)
45
-- | Initial document search engine, built from 'docSearchConfig' and
-- 'defaultSearchRankParameters'.
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine = initSearchEngine docSearchConfig defaultSearchRankParameters
49
-- | Configuration of the document search engine: how a document is keyed,
-- which terms each field contributes to the index, and how query terms are
-- normalised before lookup. No document features are used.
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig =
  SearchConfig
    { documentKey          = d_docId
    , extractDocumentTerms = termsOfField
    , transformQueryTerm   = stemQueryTerm
    , documentFeatureValue = const noFeatures
    }
  where
    -- Terms contributed to the index by one field of a document.
    termsOfField :: CsvGargV3 -> DocField -> [Text]
    termsOfField doc field = monoTexts $ case field of
      TitleField    -> d_title doc
      AbstractField -> d_abstract doc

    -- Query terms are reduced to their English stem, whatever the field.
    -- NOTE(review): document terms only go through 'monoTexts' here; confirm
    -- that they end up in the same normal form as the stemmed query terms.
    stemQueryTerm :: Text -> DocField -> Text
    stemQueryTerm tok TitleField    = englishStem tok
    stemQueryTerm tok AbstractField = englishStem tok

    -- English stemmer shared by both fields.
    englishStem :: Text -> Text
    englishStem = ST.stem ST.EN
69
-- | Default ranking parameters of the document search engine.
--
-- Title matches weigh four times as much as abstract matches (20 vs 5).
-- No feature weights or feature functions are used.
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
  SearchRankParameters
    { paramK1                         = k1
    , paramB                          = fieldB
    , paramFieldWeights               = fieldWeight
    , paramFeatureWeights             = noFeatures
    , paramFeatureFunctions           = noFeatures
    , paramResultsetSoftLimit         = 2000
    , paramResultsetHardLimit         = 4000
    , paramAutosuggestPrefilterLimit  = 500
    , paramAutosuggestPostfilterLimit = 500
    }
  where
    -- NOTE(review): presumably the BM25F k1 parameter of the underlying
    -- search-engine library; see its documentation for the exact meaning.
    k1 :: Float
    k1 = 1.5

    -- Per-field B parameter (presumably BM25F length normalisation; confirm
    -- against the library documentation).
    fieldB :: DocField -> Float
    fieldB field = case field of
      TitleField    -> 0.9
      AbstractField -> 0.5

    -- Relative weight of each field in the ranking.
    fieldWeight :: DocField -> Float
    fieldWeight field = case field of
      TitleField    -> 20
      AbstractField -> 5
94