{-|
Module      : Gargantext.Text.Metrics
Description : Metrics for text mining (terms selection and scoring).
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Mainly reexport functions in @Data.Text.Metrics@


TODO
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m

-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Text.Metrics
  where

import Data.Text (Text, pack)
import Data.Map (Map)

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import Data.Tuple.Extra (both)
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
import Data.Array.Accelerate (toList)


import Gargantext.Prelude

import Gargantext.Text.Metrics.Count (occurrences, cooc)
import Gargantext.Text.Terms (TermType(MonoMulti), terms)
import Gargantext.Core (Lang(EN))
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index

import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA

import GHC.Real (round)

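-- A possible form of the noApax TODO in the module header above (kept as a
-- comment: the Occ count type is assumed to live in
-- Gargantext.Text.Metrics.Count or Gargantext.Prelude and is not imported
-- here):
--
-- noApax :: Ord a => Map a Occ -> Map a Occ
-- noApax = M.filter (>1)
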
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m

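-- A hypothetical completion of the commented-out filterCooc above, assembled
-- from its own fragments; it assumes a filterMat function over the
-- co-occurrence matrix exists (none is imported here), so it stays commented
-- out:
--
-- filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
-- filterCooc m = fromIndex fi . filterMat $ cooc2mat ti m
--   where
--     (ti, fi) = createIndices m
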

type ListSize = Int
type BinSize = Double

-- | Select up to @l@ terms from a scored list: terms are first ranked by
-- inclusion/exclusion, then sampled across bins of specificity/genericity
-- so that the selection covers several regions of that score.
takeSome :: Ord t => ListSize -> BinSize -> [Scored t] -> [Scored t]
takeSome l s scores = L.take l
                    $ takeSample n m
                    $ takeKmeans l'
                    $ L.reverse $ L.sortOn _scored_incExc scores
  where
    -- TODO: KMEAN split into 2 main clusters
    -- (advice: use the accelerate-examples kmeans version
    -- and maybe benchmark it to be sure)
    takeKmeans = L.take
    l' = 4000
    -- items kept per bin, and bin size, both derived from the BinSize s
    n = round (fromIntegral l / s)
    m = round (fromIntegral (length scores) / s)
    -- rank by specificity/genericity, split into bins of size m', then keep
    -- n' terms (ordered by inclusion/exclusion) from each bin
    takeSample n' m' xs = L.concat $ map (L.take n')
                        $ L.reverse $ map (L.sortOn _scored_incExc)
                        $ splitEvery m'
                        $ L.reverse $ L.sortOn _scored_speGen xs

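-- A usage sketch for 'takeSome' (coocMap and selected are hypothetical names,
-- not defined in this module): keep at most 100 terms, with bins of size 10,
-- out of the scored co-occurrence map:
--
-- > selected = takeSome 100 10 (incExcSpeGen_sorted' coocMap)
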

data Scored t = Scored { _scored_terms  :: t
                       , _scored_incExc :: InclusionExclusion
                       , _scored_speGen :: SpecificityGenericity
                       } deriving (Show)

-- | Score each term of a co-occurrence map by inclusion/exclusion and
-- specificity/genericity.
incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
  where
    (ti,fi)  = createIndices m
    (is, ss) = incExcSpeGen $ cooc2mat ti m
    scores   = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)

-- | Terms ranked (descending) by inclusion/exclusion and by
-- specificity/genericity, as two separate lists.
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
  where
    (ti,fi) = createIndices m
    ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)


metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences

metrics_sentences' :: [Text]
metrics_sentences' = splitBy (Sentences 0) metrics_text

-- | Sentences
metrics_sentences :: [Text]
metrics_sentences = [ "There is a table with a glass of wine and a spoon."
                    , "I can see the glass on the table."
                    , "There was only a spoon on that table."
                    , "The glass just fall from the table, pouring wine everywhere."
                    , "I wish the glass did not contain wine."
                    ]

metrics_sentences_Test :: Bool
metrics_sentences_Test = metrics_sentences == metrics_sentences'

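-- Quick check sketch (GHCi): the intended result is True, i.e. splitting
-- metrics_text with splitBy (Sentences 0) gives back metrics_sentences:
--
-- > metrics_sentences_Test
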
-- | Terms reordered to visually check occurrences
-- >>>
{- [ [["table"],["glass"],["wine"],["spoon"]]
   , [["glass"],["table"]]
   , [["spoon"],["table"]]
   , [["glass"],["table"],["wine"]]
   , [["glass"],["wine"]]
   ]
-}

metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text

-- | Occurrences
{-
fromList [ (fromList ["table"] , fromList [(["table"] , 3)])
         , (fromList ["object"], fromList [(["object"], 3)])
         , (fromList ["glas"]  , fromList [(["glas"]  , 2)])
         , (fromList ["spoon"] , fromList [(["spoon"] , 2)])
         ]
-}
metrics_occ = occurrences <$> L.concat <$> metrics_terms

{-
fromList [ ((["glas"]  ,["object"]), 6)
         , ((["glas"]  ,["spoon"]) , 4)
         , ((["glas"]  ,["table"]) , 6)
         , ((["object"],["spoon"]) , 6)
         , ((["object"],["table"]) , 9)
         , ((["spoon"] ,["table"]) , 6)
         ]
-}
metrics_cooc = cooc <$> metrics_terms

metrics_cooc_mat = do
  m <- metrics_cooc
  let (ti,_) = createIndices m
  let mat_cooc = cooc2mat ti m
  pure ( ti
       , mat_cooc
       , incExcSpeGen_proba mat_cooc
       , incExcSpeGen mat_cooc
       )

metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc

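-- A possible GHCi session over the example pipeline above (results omitted
-- here, they depend on the terms extraction):
--
-- > (ti, mat, proba, incSpe) <- metrics_cooc_mat
-- > (byInc, bySpe)           <- metrics_incExcSpeGen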