]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics/Hetero.purs
Merge branch 'dev' into dev-phylo
[gargantext.git] / src / Gargantext / Text / Metrics / Hetero.purs
1 {-|
Module      : Gargantext.Text.Hetero
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
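Heterogeneity metrics for texts: the ratio between the total number of
token occurrences and the number of distinct tokens, computed either on a
single text or on fixed-size chunks of the concatenated texts of a corpus.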
12 -}
13
14 module Gargantext.Text.Hetero where
15
import qualified Control.Exception as Exception
import GHC.Real as R

import Data.Set as S
import Data.Map as M
import Data.List.Split as S
import Database.PostgreSQL.Simple as PGS
import Opaleye.PGTypes (PGInt4)
import Opaleye.Internal.Column (Column)
23
24 import Gargantext.Database.Gargandb
25 import Gargantext.Database.Private
26 --import Gargantext.Utils.Chronos
27
28 import Gargantext.Text.Words (cleanText)
29 import Gargantext.Text.Count (occurrences)
30
31 import Gargantext.Database.Simple
32
33 --main = do
34 -- t <- getTextquery
35 -- print (Prelude.map (heterogeinity . concat) $ S.chunksOf 3 t)
36
-- | Heterogeneity computed over the concatenated texts of a corpus.
--
-- The texts returned by @getAbstract corpus_id limit@ are concatenated,
-- passed through 'cleanText', and split into chunks of @x@ elements; each
-- chunk is then scored with @dicoStruct . occurrences@.
--
-- NOTE(review): presumably @limit@ bounds the number of abstracts fetched;
-- confirm against 'getAbstract'.
heterogeinity' :: Int -> Int -> Int -> IO [Integer]
heterogeinity' corpus_id limit x =
  getAbstract corpus_id limit >>= Prelude.mapM scoreChunk . toChunks
  where
    -- One score per chunk of the cleaned, concatenated text.
    scoreChunk = dicoStruct . occurrences
    toChunks   = S.chunksOf x . cleanText . concat
42
-- | Same computation as @heterogeinity'@, with the chunk size named @size@.
--
-- Delegates to @heterogeinity'@ rather than repeating its body, so that the
-- two entry points cannot drift apart.
heterogeinity'' :: Int -> Int -> Int -> IO [Integer]
heterogeinity'' corpus_id limit size = heterogeinity' corpus_id limit size
47
48
-- | Integer ratio between the sum of all values of an occurrence map and
-- its number of keys (i.e. the mean count per distinct key, rounded
-- towards negative infinity by 'div').
--
-- An empty map yields @0@ instead of raising a divide-by-zero exception.
-- The result is wrapped in an arbitrary monad only to keep the existing
-- interface; the computation itself is pure.
dicoStruct :: (Integral r, Monad m) => M.Map t r -> m r
dicoStruct dict_occ
  | M.null dict_occ = return 0
  | otherwise       = return $ total_occ `div` keys_size
  where
    keys_size = fromIntegral (M.size dict_occ)
    total_occ = sum (M.elems dict_occ)
54
-- | Heterogeneity of a single UCT (textual context unit).
--
-- The text is passed through 'cleanText' and counted with 'occurrences';
-- the result is the total number of occurrences divided (integer division)
-- by the number of distinct keys of the resulting map.
--
-- A text that yields no keys scores @0@ instead of raising a
-- divide-by-zero exception.
heterogeinity :: [Char] -> IO Integer
heterogeinity string = return ratio
  where
    dict_occ  = occurrences $ cleanText string
    keys_size = toInteger (M.size dict_occ)
    total_occ = sum (M.elems dict_occ)
    ratio
      | keys_size == 0 = 0
      | otherwise      = total_occ `div` keys_size
64
65
-- | Heterogeneity of all the texts attached to a corpus.
--
-- Opens a PostgreSQL connection with @infoGargandb@, fetches the texts
-- selected by @nodeHyperdataText corpus_id@ through @getText@, concatenates
-- them and scores the result with 'heterogeinity' (so the result is an
-- @IO Integer@).
--
-- The connection is released with 'PGS.close' even when the query or the
-- scoring throws; it used to be left open.
--
-- NOTE(review): a previously commented-out signature here read
-- @Fractional t => Column PGInt4 -> IO (t, Integer, Integer)@; its result
-- type no longer matches the code. The argument type is whatever
-- @nodeHyperdataText@ expects (presumably @Column PGInt4@ - confirm) before
-- adding an explicit signature.
computeHeterogeinity corpus_id =
  Exception.bracket (PGS.connect infoGargandb) PGS.close $ \c -> do
    t <- getText c (nodeHyperdataText corpus_id)
    heterogeinity $ Prelude.concat t
74
-- | One 'computeHeterogeinity' action per hard-coded corpus, in the order
-- the corpora are listed below.
--
-- The value is a plain list of actions that have not been run; a caller
-- has to execute them itself (e.g. with @sequence@).
--
-- NOTE(review): a single IO action collecting all results was probably
-- intended (@mapM@ rather than @map@) - confirm before changing the type.
main2 = [ computeHeterogeinity corpus_id | (_, corpus_id) <- corpus_ids ]
  where
    -- (label, corpus id); the trailing numbers are presumably previously
    -- observed scores - TODO confirm.
    corpus_ids =
      [ ("ALL", 272927)          -- 73
      , ("Histoire", 1387736)    -- 28
      , ("Sciences Po", 1296892) -- 37
      , ("Phylosophie", 1170004) -- 20
      , ("Psychologie", 1345852) -- 37
      , ("Sociologie", 1246452)  -- 42
      ]
87
88