1 module Data.Gargantext.Ngrams.Hetero where
import Control.Exception (bracket)

import Data.List.Split as S
import Database.PostgreSQL.Simple as PGS
import Opaleye.PGTypes (PGInt4)
import Opaleye.Internal.Column (Column)

import Data.Gargantext.Database.Gargandb
import Data.Gargantext.Database.Private
--import Data.Gargantext.Utils.Chronos
import Data.Gargantext.Ngrams.Words (cleanText)
import Data.Gargantext.Ngrams.Count (occurrences)
import Data.Gargantext.Database.Simple
22 -- print (Prelude.map (heterogeinity . concat) $ S.chunksOf 3 t)
-- | Heterogeneity computed over the concatenated texts of a corpus.
--
-- The values returned by @getAbstract corpus_id limit@ are concatenated,
-- passed through 'cleanText', and cut into consecutive chunks of @x@
-- elements with 'S.chunksOf'. Each chunk is counted with 'occurrences' and
-- scored with 'dicoStruct', so the result holds one score per chunk.
--
-- NOTE(review): per the split documentation, a non-positive @x@ makes
-- 'S.chunksOf' produce an endless stream of empty chunks -- confirm that
-- callers always pass a positive size.
heterogeinity' :: Int -> Int -> Int -> IO [Integer]
heterogeinity' corpus_id limit x =
  getAbstract corpus_id limit
    >>= Prelude.mapM (dicoStruct . occurrences) . S.chunksOf x . cleanText . concat
-- | Same computation as @heterogeinity'@: the arguments are, in order, the
-- corpus id, the limit handed to the abstract query, and the chunk size.
--
-- The body used to be a copy of @heterogeinity'@ that differed only in the
-- name of the last parameter; delegating keeps the two from drifting apart.
heterogeinity'' :: Int -> Int -> Int -> IO [Integer]
heterogeinity'' = heterogeinity'
-- | Mean occurrence count per distinct key of an occurrence dictionary:
-- the sum of all values, integer-divided ('div') by the number of keys.
--
-- An empty dictionary yields @0@; without that guard the division below
-- would be a division by zero.
--
-- The result is wrapped in an arbitrary monad @m@ so the function can be
-- used directly with 'mapM' in monadic pipelines.
dicoStruct :: (Integral r, Monad m) => M.Map t r -> m r
dicoStruct dict_occ
  | M.null dict_occ = return 0
  | otherwise       = return $ total_occ `div` fromIntegral (M.size dict_occ)
  where
    -- Sum of every value stored in the dictionary.
    total_occ = sum (M.elems dict_occ)
-- | Heterogeneity of a single UCT (textual context unit).
--
-- The string is passed through 'cleanText', the result is counted with
-- 'occurrences', and the resulting dictionary is scored with 'dicoStruct'
-- (sum of the counts integer-divided by the number of distinct keys).
--
-- A string that produces an empty dictionary yields @0@; previously that
-- case ended in a division by zero.
heterogeinity :: [Char] -> IO Integer
heterogeinity string
  | M.null dict_occ = return 0
  | otherwise       = dicoStruct dict_occ
  where
    dict_occ = occurrences $ cleanText string
-- | Compute the heterogeneity score of the text stored for @corpus_id@.
--
-- Opens a PostgreSQL connection with 'infoGargandb', reads the rows selected
-- by @nodeHyperdataText corpus_id@ through 'getText', concatenates them and
-- passes the result to 'heterogeinity'.
--
-- The connection is wrapped in 'bracket' so that 'PGS.close' runs whether the
-- body finishes normally or throws; previously it was never closed.
--
-- NOTE(review): no type signature is declared. The module's imports suggest
-- @Column PGInt4 -> IO Integer@ -- confirm against 'nodeHyperdataText' before
-- adding it.
computeHeterogeinity corpus_id =
  bracket (PGS.connect infoGargandb) PGS.close $ \c -> do
    t <- getText c (nodeHyperdataText corpus_id)
    heterogeinity $ Prelude.concat t
65 ,("Histoire", 1387736) -- 28
66 ,("Sciences Po", 1296892) -- 37
67 ,("Phylosophie", 1170004) -- 20
68 ,("Psychologie", 1345852) -- 37
69 ,("Sociologie", 1246452) -- 42
72 r <- Prelude.map computeHeterogeinity $ Prelude.map (\(t,id) -> id) corpus_ids