]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Metrics/TFICF.hs
Add the clustering up to level 2 and more
[gargantext.git] / src / Gargantext / Database / Metrics / TFICF.hs
1 {-|
2 Module : Gargantext.Database.Metrics.TFICF
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 TFICF, generalization of TFIDF
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE QuasiQuotes #-}
17 {-# LANGUAGE RankNTypes #-}
18
19 module Gargantext.Database.Metrics.TFICF where
20
21 import Database.PostgreSQL.Simple.SqlQQ (sql)
22 import qualified Database.PostgreSQL.Simple as DPS
23
24 import Safe (headMay)
25 import Gargantext.Text.Metrics.TFICF -- (tficf)
26 import Gargantext.Prelude
27 import Gargantext.Core.Types.Individu (UsernameMaster)
28 import Gargantext.Database.Utils (Cmd, runPGSQuery)
29 import Gargantext.Database.Types.Node (ListId, CorpusId, NodeType(..))
30 import Gargantext.Database.Config (nodeTypeId)
31 import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms, NgramsType, ngramsTypeId)
32
33 type OccGlobal = Double
34 type OccCorpus = Double
35
36
37 getTficf :: UsernameMaster -> CorpusId -> ListId -> NgramsType
38 -> Cmd err [Tficf]
39 getTficf u cId lId ngType = do
40 g <- getTficfGlobal u
41 c <- getTficfCorpus cId
42 ngs <- getTficfNgrams u cId lId ngType
43
44 pure $ map (\(nId, nTerms, wm, wn)
45 -> Tficf nId nTerms
46 (tficf (TficfCorpus wn (fromIntegral c))
47 (TficfLanguage wm (fromIntegral g))
48 )
49 ) ngs
50
51 getTficfGlobal :: UsernameMaster -> Cmd err Int
52 getTficfGlobal u = maybe 0 identity <$> headMay
53 <$> map (\(DPS.Only n) -> n )
54 <$> runPGSQuery q p
55 where
56 p = (u, nodeTypeId NodeDocument)
57 q = [sql| SELECT count(*) from nodes n
58 JOIN auth_user a ON a.id = n.user_id
59 WHERE
60 a.username = ?
61 AND n.typename = ?
62 |]
63
64 getTficfCorpus :: CorpusId -> Cmd err Int
65 getTficfCorpus cId = maybe 0 identity <$> headMay
66 <$> map (\(DPS.Only n) -> n )
67 <$> runPGSQuery q p
68 where
69 p = (cId, nodeTypeId NodeDocument)
70 q = [sql| WITH input(corpusId, typename) AS ((VALUES(?::"int4",?::"int4")))
71 SELECT count(*) from nodes_nodes AS nn
72 JOIN nodes AS n ON n.id = nn.node2_id
73 JOIN input ON nn.node1_id = input.corpusId
74 WHERE n.typename = input.typename;
75 |]
76
77
78
79 getTficfNgrams :: UsernameMaster -> CorpusId -> ListId -> NgramsType
80 -> Cmd err [(NgramsId, NgramsTerms, OccGlobal, OccCorpus)]
81 getTficfNgrams u cId lId ngType = runPGSQuery queryTficf p
82 where
83 p = (u, nodeTypeId NodeList, nodeTypeId NodeDocument, ngramsTypeId ngType, cId, lId)
84
85
86 queryTficf :: DPS.Query
87 queryTficf = [sql|
88 -- TODO add CTE for groups
89 WITH input(masterUsername,typenameList,typenameDoc,ngramsTypeId,corpusId,listId)
90 AS ((VALUES(?::"text", ? :: "int4", ?::"int4", ?::"int4",?::"int4",?::"int4"))),
91 -- AS ((VALUES('gargantua'::"text", 5 :: "int4", 4::"int4", 4::"int4",1018::"int4",1019::"int4"))),
92
93 list_master AS (
94 SELECT n.id,n.name,n.user_id from nodes n
95 JOIN input ON n.typename = input.typenameList
96 JOIN auth_user a ON a.id = n.user_id
97 WHERE
98 a.username = input.masterUsername
99 ),
100
101 ngrams_master AS (
102 SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight FROM nodes_ngrams nng
103 JOIN list_master ON list_master.id = nng.node_id
104 JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
105 JOIN nodes n ON n.id = nng2.node_id
106 JOIN input ON input.typenameDoc = n.typename
107 JOIN ngrams ng ON ng.id = nng2.ngrams_id
108 WHERE
109 nng.ngrams_type = input.ngramsTypeId
110 -- AND n.hyperdata -> 'lang' = 'en'
111 GROUP BY ng.id,ng.terms
112 ),
113
114 ngrams_user AS (
115 SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight
116 FROM nodes_ngrams nng
117 JOIN list_master ON list_master.id = nng.node_id
118
119 JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
120 JOIN nodes_nodes nn ON nn.node2_id = nng2.node_id
121
122 JOIN ngrams ng ON ng.id = nng2.ngrams_id
123 JOIN input ON nn.node1_id = input.corpusId
124
125 WHERE
126 nng.ngrams_type = input.ngramsTypeId
127 -- AND n.hyperdata -> 'lang' = 'en'
128 GROUP BY ng.id,ng.terms
129 )
130
131
132 SELECT nu.id,nu.terms,SUM(nm.weight) wm,SUM(nu.weight) wu
133 FROM ngrams_user nu
134 JOIN ngrams_master nm ON nm.id = nu.id
135 WHERE
136 nm.weight > 1
137 AND
138 nu.weight > 1
139 GROUP BY nu.id,nu.terms
140 --ORDER BY wm DESC
141 --LIMIT 1000
142
143 |]
144
145