]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Metrics/TFICF.hs
Merge branch 'dev-doc-annotation-issue' into dev-textflow
[gargantext.git] / src / Gargantext / Core / Text / Metrics / TFICF.hs
1 {-|
2 Module : Gargantext.Core.Text.Metrics.TFICF
3 Description : TFICF Ngrams tools
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Definition of TFICF : Term Frequency - Inverse of Context Frequency
11
12 TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).
13
14 -}
15
16
17 module Gargantext.Core.Text.Metrics.TFICF ( TFICF
18 , TficfContext(..)
19 , Total(..)
20 , Count(..)
21 , tficf
22 , sortTficf
23 )
24 where
25
26 import Data.Text (Text)
27 import Gargantext.Prelude
28 import Data.Set (Set)
29 import Gargantext.Core.Types (Ordering(..))
30 import Data.Map.Strict (Map, toList)
31 import qualified Data.Ord as DO (Down(..))
32 import qualified Data.List as List
33
-- | Tag naming this module. 'tficf' splices it into its panic messages so
-- that a failure can be traced back to this file.
path :: Text
path = "[G.T.Metrics.TFICF]"
36
-- | A TFICF score. This is a plain alias for 'Double', so it offers
-- readability only, not extra type safety.
type TFICF = Double
38
-- | Two measurements, of types @n@ and @m@, tagged with the context level
-- they belong to. 'tficf' instantiates @n@ with 'Count' and @m@ with
-- 'Total', and expects an infra context first and a supra context second.
data TficfContext n m
  = TficfInfra n m -- ^ measurements for the infra context
  | TficfSupra n m -- ^ measurements for the supra context
  deriving (Show)
42
-- | A total, used by 'tficf' as the denominator of a frequency.
--
-- Declared as a @newtype@: it wraps exactly one 'Double', so the wrapper
-- costs nothing at runtime while still keeping totals and counts apart in
-- type signatures.
newtype Total = Total { unTotal :: Double }
-- | A count, used by 'tficf' as the numerator of a frequency.
--
-- Declared as a @newtype@: it wraps exactly one 'Double', so the wrapper
-- costs nothing at runtime while still keeping counts and totals apart in
-- type signatures.
newtype Count = Count { unCount :: Double }
45
-- | TFICF score of a term, from its count and total in an infra context
-- and in a supra context.
--
-- With @ic@/@it@ the count and total of the infra context and @sc@/@st@
-- those of the supra context, the score is
--
-- > (ic / it) * log (st / sc)
--
-- that is, the frequency in the infra context multiplied by the logarithm
-- of the /inverse/ frequency in the supra context, mirroring TF-IDF.
-- Because the guard requires @st >= sc@, the logarithm is never negative.
--
-- Panics when a count exceeds its total, and when the arguments are not
-- (in this order) a 'TficfInfra' and a 'TficfSupra'.
--
-- NOTE: zero values are not rejected by the guard. @it == 0@ (with
-- @ic == 0@) or @sc == 0@ therefore produce @NaN@ / @Infinity@ through
-- ordinary 'Double' arithmetic rather than a panic.
tficf :: TficfContext Count Total
      -> TficfContext Count Total
      -> TFICF
tficf (TficfInfra (Count ic) (Total it))
      (TficfSupra (Count sc) (Total st))
  -- A count can never be larger than the total it is taken from.
  | it >= ic && st >= sc {-&& it <= st-} = (ic / it) * log (st / sc)
  | otherwise = panic $ "[ERR]" <> path <> " Frequency impossible"
tficf _ _ = panic $ "[ERR]" <> path <> " Undefined for these contexts"
54
55
-- | Flatten a map of scored entries into a list ordered by score.
--
-- The score is the 'Double' held in the first component of each map value.
-- 'Down' puts the highest score first, 'Up' the lowest. The sort is done
-- with 'List.sortOn' on the list produced by 'toList'.
sortTficf :: Ordering
          -> (Map Text (Double, Set Text))
          -> [ (Text,(Double, Set Text))]
sortTficf direction scored = case direction of
  Down -> List.sortOn (DO.Down . scoreOf) entries
  Up   -> List.sortOn scoreOf entries
  where
    -- Association list of the map's entries.
    entries = toList scored
    -- Sort key: the score stored alongside each key.
    scoreOf entry = fst (snd entry)
61