]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Search.hs
add new synchronic clustering
[gargantext.git] / src / Gargantext / Text / Search.hs
1 {-|
2 Module : Gargantext.Text.Search
3 Description : Search engine over CSV documents (title and abstract fields).
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This search engine was first made to clean a CSV file according to a query.
11
12 Starting from this model, a specific Gargantext engine will be made
13 (using more metrics scores/features).
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}
18
19 module Gargantext.Text.Search where
20
21 import Data.SearchEngine
22
23 import Data.Ix
24
25 -- Useful for using stopwords
26 -- import Data.Set (Set)
27 -- import qualified Data.Set as Set
28 import Data.Text (Text)
29
30 import Gargantext.Prelude
31 import Gargantext.Text.Terms.Mono (monoTexts)
32 import Gargantext.Text.Terms.Mono.Stem as ST
33 import Gargantext.Text.Corpus.Parsers.CSV
34
-- | Key identifying a document inside the search engine; the search
-- configuration obtains it from a document via 'd_docId'.
type DocId = Int
36
-- | 'SearchEngine' specialised to this module: documents are 'CsvGargV3'
-- rows, keyed by 'DocId', searched over 'DocField's, with no extra
-- features ('NoFeatures').
type DocSearchEngine =
  SearchEngine CsvGargV3 DocId DocField NoFeatures
42
-- | The parts of a document that are indexed and can be searched.
data DocField
  = TitleField     -- ^ terms extracted from the document title
  | AbstractField  -- ^ terms extracted from the document abstract
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)
46
-- | Search engine built by 'initSearchEngine' from 'docSearchConfig' and
-- 'defaultSearchRankParameters'. No documents are inserted here.
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine = initSearchEngine docSearchConfig defaultSearchRankParameters
50
-- | How documents and queries are turned into search terms.
--
-- * Documents are keyed by 'd_docId'.
-- * Document terms are whatever 'monoTexts' yields for the title or the
--   abstract, depending on the field.
-- * Query terms are passed through @ST.stem ST.EN@ (presumably the
--   English stemmer), identically for both fields.
-- * No document features are used.
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig =
  SearchConfig
    { documentKey          = d_docId
    , extractDocumentTerms = fieldTerms
    , transformQueryTerm   = stemQueryTerm
    , documentFeatureValue = const noFeatures
    }
  where
    -- Terms indexed for one field of a document.
    fieldTerms :: CsvGargV3 -> DocField -> [Text]
    fieldTerms doc field = case field of
      TitleField    -> monoTexts (d_title doc)
      AbstractField -> monoTexts (d_abstract doc)

    -- Normalise a query token; both fields use the same stemming.
    -- NOTE(review): document terms come from 'monoTexts' alone. Confirm it
    -- applies the same stemming, otherwise stemmed query tokens may fail
    -- to match the indexed terms.
    stemQueryTerm :: Text -> DocField -> Text
    stemQueryTerm tok TitleField    = ST.stem ST.EN tok
    stemQueryTerm tok AbstractField = ST.stem ST.EN tok
70
-- | Ranking parameters used by 'initialDocSearchEngine'.
--
-- The title field is weighted 20 against 5 for the abstract, and no
-- feature weights or feature functions are configured ('noFeatures').
-- NOTE(review): @paramK1@ / @paramB@ look like BM25-style tuning knobs;
-- confirm their exact meaning against the search library documentation.
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
  SearchRankParameters
    { paramK1                         = k1
    , paramB                          = fieldB
    , paramFieldWeights               = fieldWeight
    , paramFeatureWeights             = noFeatures
    , paramFeatureFunctions           = noFeatures
    , paramResultsetSoftLimit         = 2000
    , paramResultsetHardLimit         = 4000
    , paramAutosuggestPrefilterLimit  = 500
    , paramAutosuggestPostfilterLimit = 500
    }
  where
    -- Single K1 value shared by all fields.
    k1 :: Float
    k1 = 1.5

    -- Per-field B value.
    fieldB :: DocField -> Float
    fieldB field = case field of
      TitleField    -> 0.9
      AbstractField -> 0.5

    -- Per-field weight: a title counts four times as much as an abstract.
    fieldWeight :: DocField -> Float
    fieldWeight field = case field of
      TitleField    -> 20
      AbstractField -> 5
95