src/Gargantext/Text.hs

   1 {-|
   2 Module      : Gargantext.Text
   3 Description : Ngrams tools
   4 Copyright   : (c) CNRS, 2018
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Ngrams exctration.
  11
  12 Definitions of ngrams.
  13 n non negative integer
  14
  15 -}
  16
  17 {-# LANGUAGE NoImplicitPrelude #-}
  18 {-# LANGUAGE OverloadedStrings #-}
  19
  20 module Gargantext.Text
  21   where
  22
  23 import Data.Char (Char, isAlphaNum, isSpace)
  24 import Data.Text (Text, filter, toLower, split, splitOn)
  25 import qualified Data.Text as DT
  26 --import Data.Text.IO (readFile)
  27
  28 import Data.Set (Set)
  29 import qualified Data.Set as S
  30 import Data.Map.Strict  (Map
  31                         , empty
  32                         , insertWith, unionWith
  33                         , lookupIndex
  34                         --, fromList, keys
  35                         )
  36 import qualified Data.Map.Strict as M (filter)
  37 import Data.Foldable (foldl')
  38
  39 -----------------------------------------------------------------
  40 import Gargantext.Text.Ngrams.Stem.En
  41
  42 import qualified Gargantext.Text.Metrics.FrequentItemSet as FIS
  43 import Gargantext.Prelude hiding (filter)
  44 -----------------------------------------------------------------
  45
  46
  47 data ListName = Stop | Candidate | Graph
  48   deriving (Show, Eq)
  49
  50
  51
  52 data Ngroup = Ngroup { _ngroup_label  ::  Ngrams
  53                      , _ngroup_ngrams :: [Ngrams]
  54                      } deriving (Show)
  55
  56
  57 data Ngrams = Ngrams { _ngrams_label :: [Text]
  58                      , _ngrams_stem  :: Set Text
  59                      } deriving (Show)
  60
  61 text2ngrams :: Text -> Ngrams
  62 text2ngrams txt = Ngrams txt' (S.fromList $ map stem txt')
  63   where
  64     txt' = splitOn " " txt
  65
  66 equivNgrams :: Ngrams -> Ngrams -> Bool
  67 equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
  68                                         || s2 `S.isSubsetOf` s1
  69
  70
  71 type Occ     = Int
  72 --type Index   = Int
  73
  74 -- Data Ngrams = Monograms | MultiGrams
  75
  76 ngrams :: Text -> [Text]
  77 ngrams xs = monograms $ toLower $ filter isGram xs
  78
  79 clean :: Text -> Text
  80 clean txt = DT.map clean' txt
  81   where
  82     clean' '’' = '\''
  83     clean' c  = c
  84
  85 monograms :: Text -> [Text]
  86 monograms txt = split isWord txt
  87   where
  88     isWord c = c `elem` [' ', '\'', ',', ';']
  89
  90 isGram :: Char -> Bool
  91 isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
  92
  93 -- | Compute the occurrences (occ)
  94 occ :: Ord a => [a] -> Map a Occ
  95 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
  96
  97 -- TODO add groups and filter stops
  98 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
  99 sumOcc xs = foldl' (unionWith (+)) empty xs
 100
 101 --noApax :: Ord a => Map a Occ -> Map a Occ
 102 --noApax m = M.filter (>1) m
 103
 104 -- | /!\ indexes are not the same:
 105
 106 -- | Index ngrams from Map
 107 --indexNgram :: Ord a => Map a Occ -> Map Index a
 108 --indexNgram m = fromList (zip [1..] (keys m))
 109
 110 -- | Index ngrams from Map
 111 --ngramIndex :: Ord a => Map a Occ -> Map a Index
 112 --ngramIndex m = fromList (zip (keys m) [1..])
 113
 114 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
 115 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
 116
 117 indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
 118 indexIt xs = (m, is)
 119   where
 120     m  = sumOcc (map occ  xs)
 121     is = map    (indexWith m) xs
 122
 123 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
 124 list2fis n xs = (m', fs)
 125   where
 126     (m, is) = indexIt xs
 127     m'      = M.filter (>50000) m
 128     fs      = FIS.all n is
 129
 130 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
 131 text2fis n xs = list2fis n (map ngrams xs)
 132
 133 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
 134 --text2fisWith = undefined
 135
 136 -------------------------------------------------------------------
 137 -- Contexts of text
 138
 139 sentences :: Text -> [Text]
 140 sentences txt = split isStop txt
 141
 142 isStop :: Char -> Bool
 143 isStop c = c `elem` ['.','?','!']
 144
 145 ---- | https://en.wikipedia.org/wiki/Text_mining
 146 --testText :: Text
 147 --testText = DT.pack "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
 148 --
 149 --
 150 --
 151 ---- | Tests
 152 ----ngramsTest :: [Text]
 153 --ngramsTest =  ocs
 154 --  where
 155 --    --txt = concat <$> lines <$> clean <$> readFile filePath
 156 --    txt = clean $ testText
 157 --    -- | Number of sentences
 158 --    ls   = sentences $ txt
 159 --    -- | Number of monograms used in the full text
 160 --    ws   = ngrams    $ txt
 161 --    -- | stem ngrams
 162 --    -- TODO
 163 --    -- group ngrams
 164 --    ocs  = occ       $ ws
 165 --
 166 --