]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Mono/Token.hs
[Backup][WIP]
[gargantext.git] / src / Gargantext / Core / Text / Terms / Mono / Token.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Mono.Token
3 Description : Tokens and tokenizing a text
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 In computer science, lexical analysis, lexing or tokenization is the
11 process of converting a sequence of characters (such as in a computer
12 program or web page) into a sequence of tokens (strings with an assigned
13 and thus identified meaning).
14 Source: https://en.wikipedia.org/wiki/Tokenize
15
16 -}
17
18
19 module Gargantext.Core.Text.Terms.Mono.Token (tokenize)
20 where
21
22 import Data.Text (Text)
23 import qualified Gargantext.Core.Text.Terms.Mono.Token.En as En
24
25 -- | Contexts depend on the lang
26 --import Gargantext.Core (Lang(..))
27
-- | A token is represented as a plain 'Text' value; this alias is the element type of the list returned by 'tokenize'.
28 type Token = Text
29
30 -- >>> tokenize "A rose is a rose is a rose."
31 -- ["A","rose","is","a","rose","is","a","rose","."]
32
33
-- | Break a text into its list of tokens.
--
-- The input is handed unchanged to the English tokenizer ('En.tokenize');
-- no language selection takes place in this module.
tokenize :: Text -> [Token]
tokenize txt = En.tokenize txt
36
37 --data Context = Letter | Word | Sentence | Line | Paragraph
38 --
39 --tokenize' :: Lang -> Context -> [Token]
40 --tokenize' = undefined
41 --