]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Mono/Stem.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Text / Terms / Mono / Stem.hs
1 {-|
2 Module : Gargantext.Text.Terms.Mono.Stem
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 In linguistic morphology and information retrieval, stemming is the
11 process of reducing inflected (or sometimes derived) words to their word
12 stem, base or root form—generally a written word form. The @stem@ need
13 not be identical to the morphological root of the word; it is usually
14 sufficient that related words map to the same stem, even if this stem is
15 not in itself a valid root.
16 Source : https://en.wikipedia.org/wiki/Stemming
17
18 -}
19
20 {-# LANGUAGE NoImplicitPrelude #-}
21
22 module Gargantext.Text.Terms.Mono.Stem (stem, Lang(..))
23 where
24
25 import Data.Text (Text)
26 import qualified Data.Text as DT
27 import qualified NLP.Stemmer as N
28
29 import Gargantext.Prelude
30 import Gargantext.Core (Lang(..))
31
32 -- (stem, Stemmer(..))
33
34 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
35 --import Language.Aspell.Options (ACOption(..))
36
37
38 -- | Stemmer
39
40 -- A stemmer for English, for example, should identify the string "cats"
41 -- (and possibly "catlike", "catty" etc.) as based on the root "cat".
42
43 -- and
44 -- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
45 -- algorithm reduces the words "fishing", "fished", and "fisher" to the
46 -- root word, "fish". On the other hand, "argue", "argued", "argues",
47 -- "arguing", and "argus" reduce to the stem "argu" (illustrating the
48 -- case where the stem is not itself a word or root) but "argument" and
49 -- "arguments" reduce to the stem "argument".
50
51
-- | Stem a 'Text' using the stemming algorithm selected by the given 'Lang'.
--
-- The input is unpacked to a 'String', handed to @N.stem@ together with the
-- chosen algorithm, and the result is packed back into a 'Text'.
stem :: Lang -> Text -> Text
stem lang word = DT.pack (N.stem (algorithm lang) (DT.unpack word))
  where
    -- Translate the project's language tag into the stemmer's algorithm.
    -- Only 'EN' and 'FR' are matched; a catch-all branch calling @panic@
    -- ("not implemented yet") was sketched earlier but is left disabled.
    algorithm EN = N.English
    algorithm FR = N.French
59
60
61