where
import Data.Text (Text, pack)
+import Data.Ord (comparing, Down(..))
import Data.Map (Map)
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA
+-- import Data.Array.Accelerate ((:.)(..), Z(..))
import GHC.Real (round)
-filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
-filterCooc cc = filterCooc' ts cc
- where
- ts = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
+import Debug.Trace
+import Prelude (seq)
+
+data MapListSize = MapListSize Int
+data InclusionSize = InclusionSize Int
+data SampleBins = SampleBins Double
+data Clusters = Clusters Int
+data DefaultValue = DefaultValue Int
-filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
-filterCooc' ts m = foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m') M.empty selection
+data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
+ , fc_inclusionSize :: InclusionSize
+ , fc_sampleBins :: SampleBins
+ , fc_clusters :: Clusters
+ , fc_defaultValue :: DefaultValue
+ }
+
+filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
+filterCooc fc cc = (filterCooc' fc) ts cc
where
- errMessage = panic "Filter cooc: no key"
- selection = [(x,y) | x <- ts, y <- ts, x > y]
+ ts = map _scored_terms $ takeSome fc $ coocScored cc
-type MapListSize = Int
-type SampleBins = Double
-type Clusters = Int
+filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
+filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
+ foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
+ M.empty selection
+ where
+ selection = [(x,y) | x <- ts
+ , y <- ts
+ -- , x >= y
+ ]
+
-- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
-takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
-takeSome l s k scores = L.take l
+takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
+takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
$ takeSample n m
- $ splitKmeans k scores
+ $ L.take l' $ sortWith (Down . _scored_incExc) scores
+ -- $ splitKmeans k scores
where
-- TODO: benchmark with accelerate-example kmeans version
- splitKmeans x xs = elements
- $ V.head
+ splitKmeans x xs = L.concat $ map elements
+ $ V.take (k-1)
$ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
euclidSq x xs
n = round ((fromIntegral l)/s)
m = round $ (fromIntegral $ length scores) / (s)
- takeSample n m xs = L.concat $ map (L.take n)
- $ L.reverse $ map (L.sortOn _scored_incExc)
+ takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
+ L.concat $ map (L.take n)
+ $ map (sortWith (Down . _scored_incExc))
-- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
-- without homogeneous order hypothesis
- $ splitEvery m
- $ L.reverse $ L.sortOn _scored_speGen xs
+ $ splitEvery m
+ $ sortWith (Down . _scored_speGen) xs
data Scored t = Scored { _scored_terms :: !t
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
---coocScored :: Ord t => Map (t,t) Int -> [Scored t]
---coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
-coocScored :: (DAA.Elt t, Ord t) => Map (t,t) Int -> [Scored t]
-coocScored m = map (\(t,inc,spe) -> Scored t inc spe) scores
+-- TODO in the textflow we end up needing these indices, it might be better
+-- to compute them earlier and pass them around.
+coocScored :: Ord t => Map (t,t) Int -> [Scored t]
+coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where
(ti,fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m
- scores = DAA.toList $ DAA.run $ DAA.zip3 (DAA.use ts) (DAA.use is) (DAA.use ss)
- ts = DAA.fromList (DAA.arrayShape is) (snd <$> M.toAscList fi)
- -- TODO fi should already be a Vector
-
-
+ scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
where
(ti,fi) = createIndices m
- ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
+ ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
-}
metrics_terms :: IO [[Terms]]
-metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
+metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
-- | Occurrences
{-