src/Gargantext/Text/Ngrams/Stem.hs

   1 {-|
   2 Module      : Gargantext.Text.Ngrams.Stem
   3 Description : Word stemming (reducing words to their stem)
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 In linguistic morphology and information retrieval, stemming is the
  11 process of reducing inflected (or sometimes derived) words to their word
  12 stem, base or root form—generally a written word form. The @stem@ need
  13 not be identical to the morphological root of the word; it is usually
  14 sufficient that related words map to the same stem, even if this stem is
  15 not in itself a valid root.
  16 Source : https://en.wikipedia.org/wiki/Stemming
  17
  18 -}
  19
  20
  21 module Gargantext.Text.Ngrams.Stem (stem, Lang(..))
  22   where
  23
  24 import Data.Text (Text)
  25 import qualified Data.Text   as DT
  26 import qualified NLP.Stemmer as N
  27
  28 import Gargantext.Core (Lang(..))
  29
  30 -- (stem, Stemmer(..))
  31
  32 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
  33 --import Language.Aspell.Options (ACOption(..))
  34
  35
  36 -- | Reduce a word to its stem, using the stemming algorithm that matches
  37 -- the given language ('EN' selects @N.English@, 'FR' selects @N.French@).
  38 --
  39 -- A stemmer for English, for example, should identify the string "cats"
  40 -- (and possibly "catlike", "catty" etc.) as based on the root "cat", and
  41 -- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
  42 -- algorithm reduces the words "fishing", "fished", and "fisher" to the
  43 -- root word, "fish". On the other hand, "argue", "argued", "argues",
  44 -- "arguing", and "argus" reduce to the stem "argu" (illustrating the
  45 -- case where the stem is not itself a word or root) but "argument" and
  46 -- "arguments" reduce to the stem "argument".
  47 --
  48 -- The input is unpacked to a 'String' for the stemmer and the result is
  49 -- packed back into 'Text'.
  50 stem :: Lang -> Text -> Text
  51 stem lang txt = DT.pack stemmed
  52   where
  53     stemmed = N.stem (algorithm lang) (DT.unpack txt)
  54     algorithm EN = N.English
  55     algorithm FR = N.French
  56
  57
  58