src/Gargantext/Core/Text/Terms/Mono/Token.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Terms.Mono.Token
   3 Description : Tokens and tokenizing a text
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 In computer science, lexical analysis, lexing or tokenization is the
  11 process of converting a sequence of characters (such as in a computer
  12 program or web page) into a sequence of tokens (strings with an assigned
  13 and thus identified meaning).
  14 Source: https://en.wikipedia.org/wiki/Tokenize
  15
  16 -}
  17
  18
  19 module Gargantext.Core.Text.Terms.Mono.Token (tokenize)
  20   where
  21
  22 import Data.Text (Text)
  23 import qualified Gargantext.Core.Text.Terms.Mono.Token.En as En
  24
  25 -- Contexts depend on the language (see the commented-out tokenize' sketch below)
  26 --import Gargantext.Core (Lang(..))
  27
  28 type Token = Text
  29
  30 -- >>> tokenize "A rose is a rose is a rose."
  31 -- ["A","rose","is","a","rose","is","a","rose", "."]
  32
  33
  34 tokenize :: Text -> [Token]
  35 tokenize = En.tokenize
  36
  37 --data Context = Letter | Word | Sentence | Line | Paragraph
  38 --
  39 --tokenize' :: Lang -> Context -> [Token]
  40 --tokenize' = undefined
  41 --