]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Ngrams/Hetero_hs
[FEAT/TYPES] Parsers main types.
[gargantext.git] / src / Data / Gargantext / Ngrams / Hetero_hs
1 module Data.Gargantext.Ngrams.Hetero where
2
import Control.Exception (bracket)
import GHC.Real as R

import Data.Set as S
import Data.Map as M
import Data.List.Split as S
import Database.PostgreSQL.Simple as PGS
import Opaleye.PGTypes (PGInt4)
import Opaleye.Internal.Column (Column)

import Data.Gargantext.Database.Gargandb
import Data.Gargantext.Database.Private
--import Data.Gargantext.Utils.Chronos

import Data.Gargantext.Ngrams.Words (cleanText)
import Data.Gargantext.Ngrams.Count (occurrences)

import Data.Gargantext.Database.Simple
19
20 --main = do
21 -- t <- getTextquery
22 -- print (Prelude.map (heterogeinity . concat) $ S.chunksOf 3 t)
23
-- | Heterogeneity computed over the concatenation of a corpus' texts.
--
-- Fetches the abstracts with @getAbstract corpus_id limit@, concatenates
-- them, runs the result through @cleanText@ and cuts it into chunks of @x@
-- elements.  Each chunk is turned into an @occurrences@ map and scored with
-- 'dicoStruct'; the scores are returned in chunk order.
heterogeinity' :: Int -> Int -> Int -> IO [Integer]
heterogeinity' corpus_id limit x = do
  abstracts <- getAbstract corpus_id limit
  let cleaned = cleanText (concat abstracts)
      chunks  = S.chunksOf x cleaned
  Prelude.mapM (dicoStruct . occurrences) chunks
29
-- | Chunked heterogeneity of a corpus: same computation as @heterogeinity'@,
-- with the chunk length passed as @size@.
--
-- The abstracts returned by @getAbstract corpus_id limit@ are concatenated,
-- cleaned with @cleanText@, split into chunks of @size@ elements, and each
-- chunk's @occurrences@ map is scored with 'dicoStruct'.
heterogeinity'' :: Int -> Int -> Int -> IO [Integer]
heterogeinity'' corpus_id limit size =
  getAbstract corpus_id limit >>= scoreChunks
  where
    -- Score every chunk of the cleaned, concatenated abstracts.
    scoreChunks docs =
      Prelude.mapM
        (dicoStruct . occurrences)
        (S.chunksOf size (cleanText (concat docs)))
34
35
-- | Integer quotient of the sum of all values in the map by its number of
-- keys, returned in an arbitrary monad.
--
-- An empty map yields @0@: dividing by its key count (zero) would otherwise
-- raise a divide-by-zero exception.
dicoStruct :: (Integral r, Monad m) => M.Map t r -> m r
dicoStruct dict_occ
  | M.null dict_occ = return 0
  | otherwise       = return $ total_occ `div` fromIntegral keys_size
  where
    keys_size = M.size dict_occ          -- number of distinct keys
    total_occ = sum (M.elems dict_occ)   -- sum of every value
41
-- | Heterogeneity over a UCT ("Unité de Contexte Textuel", a textual
-- context unit).
--
-- Builds the @occurrences@ map of @cleanText string@ and returns the integer
-- quotient of the sum of its values by its number of keys.
--
-- When the map is empty the result is @0@; dividing by the (zero) key count
-- would otherwise raise a divide-by-zero exception.
heterogeinity :: [Char] -> IO Integer
heterogeinity string
  | M.null dict_occ = return 0
  | otherwise       = return $ total_occ `div` fromIntegral keys_size
  where
    dict_occ  = occurrences $ cleanText string
    keys_size = M.size dict_occ          -- number of distinct keys
    total_occ = sum (M.elems dict_occ)   -- sum of every value
51
52
-- | Compute the 'heterogeinity' score of the texts attached to a corpus.
--
-- Opens a PostgreSQL connection with @infoGargandb@, fetches the texts
-- selected by @nodeHyperdataText corpus_id@ through @getText@, concatenates
-- them and hands the result to 'heterogeinity'.
--
-- The connection is acquired with 'bracket' so that it is closed whether
-- the query succeeds or throws; it was previously never closed.
--
-- NOTE(review): no type signature is given.  The old commented-out one
-- (@Column PGInt4 -> IO (t, Integer, Integer)@) no longer matches: the
-- result is the @IO Integer@ produced by 'heterogeinity', and the argument
-- type is whatever @nodeHyperdataText@ accepts -- confirm and add it.
computeHeterogeinity corpus_id =
  bracket (PGS.connect infoGargandb) PGS.close $ \conn -> do
    texts <- getText conn (nodeHyperdataText corpus_id)
    heterogeinity $ Prelude.concat texts
61
-- | One 'computeHeterogeinity' action per hard-coded corpus id, in the order
-- of the table below.
--
-- NOTE(review): this is a plain list of IO actions (the original @do@ block
-- ran in the list monad), so nothing is executed until a caller sequences
-- it.  @mapM@ was probably intended; changing it would alter this binding's
-- type, so it is left as is -- confirm with callers.
main2 = Prelude.map (computeHeterogeinity . snd) corpus_ids
  where
    -- (label, corpus id); the trailing numbers are kept from the original
    -- source -- presumably previously observed scores, TODO confirm.
    corpus_ids =
      [ ("ALL", 272927)          -- 73
      , ("Histoire", 1387736)    -- 28
      , ("Sciences Po", 1296892) -- 37
      , ("Phylosophie", 1170004) -- 20
      , ("Psychologie", 1345852) -- 37
      , ("Sociologie", 1246452)  -- 42
      ]
74
75