src/Gargantext/Text/Terms/Mono/Token.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Mono.Token
   3 Description : Tokens and tokenizing a text
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 In computer science, lexical analysis, lexing or tokenization is the
  11 process of converting a sequence of characters (such as in a computer
  12 program or web page) into a sequence of tokens (strings with an assigned
  13 and thus identified meaning).
  14 Source: https://en.wikipedia.org/wiki/Tokenize
  15
  16 -}
  17
  18 {-# LANGUAGE NoImplicitPrelude #-}
  19
  20 module Gargantext.Text.Terms.Mono.Token (tokenize)
  21   where
  22
  23 import Data.Text (Text)
  24 import qualified Gargantext.Text.Terms.Mono.Token.En as En
  25
  26 -- Contexts depend on the language (the Lang import below is currently disabled)
  27 --import Gargantext.Core (Lang(..))
  28
-- | A token is represented as a plain 'Text' value.
type Token = Text
  30
  31 -- >>> tokenize "A rose is a rose is a rose."
  32 -- ["A","rose","is","a","rose","is","a","rose", "."]
  33
  34
  35 tokenize :: Text -> [Token]
  36 tokenize = En.tokenize
  37
  38 --data Context = Letter | Word | Sentence | Line | Paragraph
  39 --
  40 --tokenize' :: Lang -> Context -> [Token]
  41 --tokenize' = undefined
  42 --