]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Metrics/TFICF.hs
[openalex] first attempt on integration this
[gargantext.git] / src / Gargantext / Core / Text / Metrics / TFICF.hs
1 {-|
2 Module : Gargantext.Core.Text.Metrics.TFICF
3 Description : TFICF Ngrams tools
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Definition of TFICF : Term Frequency - Inverse of Context Frequency
11
12 TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).
13
14 -}
15
16
17 module Gargantext.Core.Text.Metrics.TFICF ( TFICF
18 , TficfContext(..)
19 , Total(..)
20 , Count(..)
21 , tficf
22 , sortTficf
23 )
24 where
25
26 import Data.Map.Strict (Map, toList)
27 import Data.Text (Text)
28 import Gargantext.Core.Types (Ordering(..))
29 import Gargantext.Prelude
30 import qualified Data.List as List
31 import qualified Data.Ord as DO (Down(..))
32
-- | Tag identifying this module; it is spliced into the panic messages
-- raised by 'tficf' so the origin of a failure is visible.
path :: Text
path = "[G.T.Metrics.TFICF]"
35
-- | Result type of 'tficf': the score is a plain 'Double'.
type TFICF = Double
37
-- | A pair of measurements tagged with the context it was taken in.
--
-- 'tficf' instantiates @n@ with 'Count' and @m@ with 'Total', and expects
-- an infra value as its first argument and a supra value as its second.
data TficfContext n m
  = TficfInfra n m  -- ^ measurements for the inner context
  | TficfSupra n m  -- ^ measurements for the enclosing context
  deriving Show
41
-- | 'Double' wrapper used by 'tficf' as the second component of a
-- 'TficfContext' (the value each count is compared against).
--
-- Declared as a @newtype@: with a single constructor and a single field the
-- wrapper needs no runtime box of its own, and the constructor and the
-- 'unTotal' accessor are used exactly as before.
newtype Total = Total { unTotal :: Double }
-- | 'Double' wrapper used by 'tficf' as the first component of a
-- 'TficfContext' (the value that must not exceed the matching 'Total').
--
-- Declared as a @newtype@: with a single constructor and a single field the
-- wrapper needs no runtime box of its own, and the constructor and the
-- 'unCount' accessor are used exactly as before.
newtype Count = Count { unCount :: Double }
44
-- | Compute the TFICF score from measurements in two contexts.
--
-- The first argument must be a 'TficfInfra' pair (count @ic@, total @it@)
-- and the second a 'TficfSupra' pair (count @sc@, total @st@). The score is
--
-- > (it / ic) * log (st / sc)
--
-- Panics with \"Frequency impossible\" unless @it >= ic@, @st >= sc@ and
-- @it <= st@ all hold, and with \"Undefined for these contexts\" for any
-- other combination of constructors.
--
-- NOTE(review): a zero count passes the guard, so the division then yields
-- an infinite or NaN 'Double' instead of a panic; this is left as is —
-- confirm whether callers rely on it.
tficf :: TficfContext Count Total
      -> TficfContext Count Total
      -> TFICF
tficf (TficfInfra (Count ic) (Total it))
      (TficfSupra (Count sc) (Total st))
  | consistent = (it / ic) * log (st / sc)
  | otherwise  = panic $ "[ERR]" <> path <> " Frequency impossible"
  where
    -- Each count is bounded by its own total, and the infra total is
    -- bounded by the supra total.
    consistent = it >= ic && st >= sc && it <= st
-- Separator space added so this message is formatted like the one above.
tficf _ _ = panic $ "[ERR]" <> path <> " Undefined for these contexts"
56
57
-- | Flatten a map of scores into a list of @(key, score)@ pairs sorted by
-- score: largest first for 'Down', smallest first for 'Up'.
sortTficf :: Ordering
          -> Map Text Double
          -> [(Text, Double)]
sortTficf order scores = case order of
  Down -> List.sortOn (DO.Down . snd) pairs
  Up   -> List.sortOn snd pairs
  where
    -- Key/score pairs of the map, before any reordering by score.
    pairs = toList scores
63