Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Ngrams.hs
[CSV Parser] Parser for Gargantext (legacy) CSV files.
[gargantext.git] / src / Gargantext / Text / Ngrams.hs
1 {-|
2 Module : Gargantext.Text.Ngrams
3 Description : Ngrams definition and tools
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 An @n-gram@ is a contiguous sequence of n items from a given sample of
11 text. In the Gargantext application, the items are words and n is a
12 non-negative integer.
13
14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
17 "four-gram", "five-gram", and so on.
18
19 Source: https://en.wikipedia.org/wiki/Ngrams
20
21 -}
22
23 {-# LANGUAGE NoImplicitPrelude #-}
24
25 module Gargantext.Text.Ngrams
26 where
27
28 import Data.Char (Char, isAlphaNum, isSpace)
29 import Data.Text (Text, split, splitOn, pack)
30
31 import Data.Set (Set)
32 import qualified Data.Set as S
33
34 import Gargantext.Prelude
35 import Gargantext.Core
36
37 import Gargantext.Text.Ngrams.Stem (stem)
38
39
-- | A piece of text broken into tokens, paired with the set of stems of
-- those tokens.
data Ngrams = Ngrams
  { _ngrams_label :: [Text]
    -- ^ The tokens, kept as a list (order and duplicates preserved).
  , _ngrams_stem  :: Set Text
    -- ^ The stems, kept as a set (no order, no duplicates).
  }
  deriving Show
43
44
-- | Tag distinguishing single-word terms from multi-word terms.
--
-- NOTE(review): both constructors are nullary; they only share their
-- names with the type aliases below and carry no 'Text' payload.
-- Confirm whether @MonoGrams MonoGrams | MultiGrams MultiGrams@ was
-- intended.
data Terms
  = MonoGrams
  | MultiGrams

-- | Alias for 'Text'.
type MonoGrams = Text

-- | Alias for a list of 'Text'.
type MultiGrams = [Text]
48
49
-- | Tokenise a text into n-grams.
--
-- Currently this simply delegates to 'monograms'.
ngrams :: Text -> [Text]
ngrams txt = monograms txt
52
-- | Build an 'Ngrams' value from a text.
--
-- The text is cut on every single space character; the resulting pieces
-- become the labels, and the set of @stem lang@ applied to each piece
-- becomes the stems. Consecutive spaces therefore yield empty pieces,
-- which are kept and stemmed like any other piece.
text2ngrams :: Lang -> Text -> Ngrams
text2ngrams lang txt = Ngrams tokens stems
  where
    tokens = splitOn (pack " ") txt
    stems  = S.fromList (map (stem lang) tokens)
57
58
-- | Two 'Ngrams' are equivalent when the stem set of one is a subset of
-- the stem set of the other, in either direction.
--
-- Labels are ignored. An 'Ngrams' whose stem set is empty is therefore
-- equivalent to every other 'Ngrams'.
equivNgrams :: Ngrams -> Ngrams -> Bool
equivNgrams left right = leftInRight || rightInLeft
  where
    leftStems   = _ngrams_stem left
    rightStems  = _ngrams_stem right
    leftInRight = leftStems `S.isSubsetOf` rightStems
    rightInLeft = rightStems `S.isSubsetOf` leftStems
62
63 --monograms :: Text -> [Text]
64 --monograms xs = monograms $ toLower $ filter isGram xs
65
-- | Cut a text into single-word tokens.
--
-- The text is split at every space, apostrophe, comma and semicolon.
-- The separators themselves are dropped; two adjacent separators yield
-- an empty token between them.
monograms :: Text -> [Text]
monograms = split isSeparator
  where
    -- A character is a token boundary when it is one of the separators.
    isSeparator c = c `elem` separators
    separators    = [' ', '\'', ',', ';']
70
-- | Tell whether a character may appear inside a gram: alphanumeric
-- characters, whitespace, and the punctuation marks @-@, @/@ and @'@.
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || isAllowedPunctuation
  where
    isAllowedPunctuation = c `elem` ['-', '/', '\'']
73