{-| Module : Gargantext.Ngrams Description : Ngrams tools Copyright : (c) CNRS, 2018 License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX Ngrams exctration. Definitions of ngrams. n non negative integer -} {-# LANGUAGE NoImplicitPrelude #-} module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters --, module Gargantext.Ngrams.Hetero , module Gargantext.Ngrams.CoreNLP , module Gargantext.Ngrams.Parser , module Gargantext.Ngrams.Occurrences , module Gargantext.Ngrams.TextMining , module Gargantext.Ngrams.Metrics , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean , ListName(..), equivNgrams, isGram, sentences , ngramsTest --, module Gargantext.Ngrams.Words ) where import Gargantext.Ngrams.Letters --import Gargantext.Ngrams.Hetero import Gargantext.Ngrams.CoreNLP import Gargantext.Ngrams.Parser import Gargantext.Ngrams.Occurrences import Gargantext.Ngrams.TextMining --import Gargantext.Ngrams.Words import Gargantext.Ngrams.Metrics import qualified Gargantext.Ngrams.FrequentItemSet as FIS ----------------------------------------------------------------- import Data.List (sort) import Data.Char (Char, isAlphaNum, isSpace) import Data.Text (Text, filter, toLower, split, lines, concat) import qualified Data.Text as DT import Data.Text.IO (readFile) import Data.Map.Strict (Map , empty , insertWith, unionWith , lookupIndex --, fromList, keys ) import qualified Data.Map.Strict as M (filter) import Data.Foldable (foldl') import Gargantext.Prelude hiding (filter) -- Maybe useful later: --import NLP.Stemmer (stem, Stemmer(..)) --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions) --import Language.Aspell.Options (ACOption(..)) data ListName = Stop | Candidate | Graph deriving (Show, Eq) data Ngrams = Ngrams { _ngramsNgrams :: [Text] , _ngramsStem :: [Text] , _ngramsListName :: Maybe ListName } deriving (Show) equivNgrams :: Ngrams -> Ngrams -> Bool equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _) = (sort n1) == (sort n2) || (sort s1) == (sort s2) type Occ = Int --type Index = Int -- Data Ngrams = Monograms | MultiGrams ngrams :: Text -> [Text] ngrams xs = monograms $ toLower $ filter isGram xs clean :: Text -> Text clean txt = DT.map clean' txt where clean' '’' = '\'' clean' c = c monograms :: Text -> [Text] monograms txt = split isWord txt where isWord c = c `elem` [' ', '\'', ',', ';'] isGram :: Char -> Bool isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\''] -- | Compute the occurrences (occ) occ :: Ord a => [a] -> Map a Occ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs -- TODO add groups and filter stops sumOcc :: Ord a => [Map a Occ] -> Map a Occ sumOcc xs = foldl' (unionWith (+)) empty xs --noApax :: Ord a => Map a Occ -> Map a Occ --noApax m = M.filter (>1) m -- | /!\ indexes are not the same: -- | Index ngrams from Map --indexNgram :: Ord a => Map a Occ -> Map Index a --indexNgram m = fromList (zip [1..] (keys m)) -- | Index ngrams from Map --ngramIndex :: Ord a => Map a Occ -> Map a Index --ngramIndex m = fromList (zip (keys m) [1..]) indexWith :: Ord a => Map a Occ -> [a] -> [Int] indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]]) indexIt xs = (m, is) where m = sumOcc (map occ xs) is = map (indexWith m) xs list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis]) list2fis n xs = (m', fs) where (m, is) = indexIt xs m' = M.filter (>50000) m fs = FIS.all n is text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis]) text2fis n xs = list2fis n (map ngrams xs) --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis]) --text2fisWith = undefined ------------------------------------------------------------------- -- Contexts of text sentences :: Text -> [Text] sentences txt = split isStop txt isStop :: Char -> Bool isStop c = c `elem` ['.','?','!'] -- | Tests -- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html ngramsTest :: (IO [Text], IO [Text], IO (Map Text Occ)) ngramsTest = (ws, ls, ocs) where txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt" -- | Number of sentences ls = sentences <$> txt -- | Number of monograms used in the full text ws = ngrams <$> txt -- | stem ngrams -- TODO -- group ngrams ocs = occ <$> ws