]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ext/IMT.hs
[NGRAMS] fix entropy.
[gargantext.git] / src / Gargantext / Ext / IMT.hs
{-|
Module      : Gargantext.Ext.IMT
Description : IMT schools and per-school publication counts from a HAL CSV corpus
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE OverloadedStrings #-}
13
14 module Gargantext.Ext.IMT where
15
16 import Gargantext.Prelude
17 import Data.Text (Text, splitOn)
18 import Data.Map (Map)
19
20 import qualified Data.Set as S
21 import qualified Data.List as DL
22 import qualified Data.Vector as DV
23 import qualified Data.Map as M
24
25 import Gargantext.Text.Metrics.Freq as F
26 import Gargantext.Text.Parsers.CSV as CSV
27
-- | One school entry of the 'schools' table.
data School = School
  { school_shortName :: Text
    -- ^ Short display name; this is the value stored in 'mapIdSchool'.
  , school_longName :: Text
    -- ^ Longer, descriptive name of the school.
  , school_id :: Text
    -- ^ Identifier of the school; it is compared against the entries of
    -- @csvHal_instStructId_i@ in 'publisBySchool'.
  }
  deriving (Show, Read, Eq)
32
-- | Static table of the schools known to this module, as
-- (short name, long name, identifier) triples.
--
-- The order of the entries matters: 'mapIdSchool' is built from this list
-- with 'M.fromList', so when two entries share an identifier the later one
-- wins.
schools :: [School]
schools =
  [ School "Mines Albi-Carmaux"
           "Mines Albi-Carmaux - École nationale supérieure des Mines d'Albi‐Carmaux"
           "469216"
  , School "Mines Alès"
           "EMA - École des Mines d'Alès"
           "6279"
  , School "Mines Douai"
           "Mines Douai EMD - École des Mines de Douai"
           "224096"
  , School "Mines Lille"
           "Mines Lille - École des Mines de Lille"
           "144103"
  , School "IMT Lille Douai"
           "IMT Lille Douai"
           "497330"
  , School "Mines Nantes"
           "Mines Nantes - Mines Nantes"
           "84538"
  , School "Télécom Bretagne"
           "Télécom Bretagne"
           "301262"
  , School "IMT Atlantique"
           "IMT Atlantique - IMT Atlantique Bretagne-Pays de la Loire"
           "481355"
  , School "Mines Saint-Étienne"
           "Mines Saint-Étienne MSE - École des Mines de Saint-Étienne"
           "29212"
  , School "Télécom École de Management"
           "TEM - Télécom Ecole de Management"
           "301442"
  , School "IMT Business School"
           "IMT Business School"
           "542824"
  , School "Télécom ParisTech"
           "Télécom ParisTech"
           "300362"
  , School "Télécom SudParis"
           "TSP - Télécom SudParis"
           "352124"
    -- NOTE(review): ARMINES carries the same identifier ("300362") as
    -- Télécom ParisTech above. Because the later entry wins in
    -- 'mapIdSchool', that id currently resolves to "ARMINES". This looks
    -- like a copy-paste slip -- TODO confirm the real ARMINES identifier.
  , School "ARMINES"
           "ARMINES"
           "300362"
  , School "Eurecom"
           "Eurecom"
           "421532"
  , School "Mines ParisTech"
           "MINES ParisTech - École nationale supérieure des mines de Paris"
           "301492"
  ]
99
-- | Lookup table from a school's identifier ('school_id') to its short name
-- ('school_shortName'), built from 'schools'.
--
-- If two schools share an identifier, 'M.fromList' keeps the one that comes
-- last in 'schools'.
mapIdSchool :: Map Text Text
mapIdSchool =
  M.fromList
    [ (school_id school, school_shortName school)
    | school <- schools
    ]
102
-- | Read the corpus CSV file and return its rows.
--
-- 'CSV.readCsvHal' yields a pair; only its second component (the vector of
-- 'CsvHal' rows) is kept. The file path is relative.
hal_data :: IO (DV.Vector CsvHal)
hal_data = snd <$> CSV.readCsvHal corpusFile
  where
    corpusFile = "doc/corpus_imt/Gargantext_Corpus.csv"
105
-- | Set of every identifier ('school_id') listed in 'schools'.
--
-- Despite its name, this set holds identifiers, not school names.
names :: S.Set Text
names = S.fromList (Gargantext.Prelude.map school_id schools)
108
-- | Translate a school identifier into its short name using 'mapIdSchool'.
--
-- An identifier that is not in the table is returned unchanged.
toSchoolName :: Text -> Text
toSchoolName t = M.findWithDefault t t mapIdSchool
113
-- | Count the publications of year 2017 per known school, most frequent first.
--
-- For every row whose @csvHal_publication_year@ is 2017 (the year is
-- hard-coded), the @csvHal_instStructId_i@ field is split on @", "@ and each
-- resulting identifier is counted with 'F.freq'. Only identifiers present in
-- 'names' are kept.
--
-- The first component of each result pair is the school's short name looked
-- up in 'mapIdSchool'. It is always a 'Just' in practice, because 'names' and
-- 'mapIdSchool' are both built from the identifiers of 'schools'; the 'Maybe'
-- is kept so the signature stays unchanged for callers.
--
-- Unknown identifiers are dropped /before/ sorting, so only the handful of
-- known schools is sorted rather than the whole frequency table. The result
-- is the same as filtering afterwards because 'DL.sortOn' is stable.
publisBySchool :: DV.Vector CsvHal -> [(Maybe Text, Int)]
publisBySchool hal_data' =
  [ (M.lookup schoolId mapIdSchool, count) | (schoolId, count) <- ranked ]
  where
    -- Rows published in the reference year.
    docs2017 =
      DV.filter (\doc -> csvHal_publication_year doc == 2017) hal_data'

    -- Every structure identifier mentioned by those rows, with repetitions.
    structIds =
      DL.concatMap
        (\doc -> splitOn ", " (csvHal_instStructId_i doc))
        (DV.toList docs2017)

    -- (identifier, occurrences) pairs restricted to the known schools.
    knownCounts =
      DL.filter
        (\(schoolId, _) -> S.member schoolId names)
        (M.toList (F.freq structIds))

    -- Descending by count: ascending stable sort, then reversed (so ties end
    -- up in reverse key order, exactly as before).
    ranked = DL.reverse (DL.sortOn snd knownCounts)
125
126