]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Metrics/TFICF.hs
[Graph][WIP] Cooc 2 graph and missing file.
[gargantext.git] / src / Gargantext / Database / Metrics / TFICF.hs
1 {-|
2 Module : Gargantext.Database.Metrics.TFICF
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 TFICF, generalization of TFIDF
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE QuasiQuotes #-}
17
18 module Gargantext.Database.Metrics.TFICF where
19
20 import Database.PostgreSQL.Simple.SqlQQ (sql)
21 import qualified Database.PostgreSQL.Simple as DPS
22
23 import Safe (headMay)
24 import Gargantext.Text.Metrics.TFICF -- (tficf)
25 import Gargantext.Prelude
26 import Gargantext.Core.Types.Individu (UsernameMaster)
27 import Gargantext.Database.Utils (Cmd, runPGSQuery)
28 import Gargantext.Database.Types.Node (ListId, CorpusId, NodeType(..))
29 import Gargantext.Database.Config (nodeTypeId)
30 import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms, NgramsType, ngramsTypeId)
31
32 type OccGlobal = Double
33 type OccCorpus = Double
34
35
36 getTficf :: UsernameMaster -> CorpusId -> ListId -> NgramsType
37 -> Cmd err [Tficf]
38 getTficf u cId lId ngType = do
39 g <- getTficfGlobal u
40 c <- getTficfCorpus cId
41 ngs <- getTficfNgrams u cId lId ngType
42
43 pure $ map (\(nId, nTerms, wm, wn)
44 -> Tficf nId nTerms
45 (tficf (TficfCorpus wn (fromIntegral c))
46 (TficfLanguage wm (fromIntegral g))
47 )
48 ) ngs
49
50 getTficfGlobal :: UsernameMaster -> Cmd err Int
51 getTficfGlobal u = maybe 0 identity <$> headMay
52 <$> map (\(DPS.Only n) -> n )
53 <$> runPGSQuery q p
54 where
55 p = (u, nodeTypeId NodeDocument)
56 q = [sql| SELECT count(*) from nodes n
57 JOIN auth_user a ON a.id = n.user_id
58 WHERE
59 a.username = ?
60 AND n.typename = ?
61 |]
62
63 getTficfCorpus :: CorpusId -> Cmd err Int
64 getTficfCorpus cId = maybe 0 identity <$> headMay
65 <$> map (\(DPS.Only n) -> n )
66 <$> runPGSQuery q p
67 where
68 p = (cId, nodeTypeId NodeDocument)
69 q = [sql| WITH input(corpusId, typename) AS ((VALUES(?::"int4",?::"int4")))
70 SELECT count(*) from nodes_nodes AS nn
71 JOIN nodes AS n ON n.id = nn.node2_id
72 JOIN input ON nn.node1_id = input.corpusId
73 WHERE n.typename = input.typename;
74 |]
75
76
77
78 getTficfNgrams :: UsernameMaster -> CorpusId -> ListId -> NgramsType
79 -> Cmd err [(NgramsId, NgramsTerms, OccGlobal, OccCorpus)]
80 getTficfNgrams u cId lId ngType = runPGSQuery queryTficf p
81 where
82 p = (u, nodeTypeId NodeList, nodeTypeId NodeDocument, ngramsTypeId ngType, cId, lId)
83
84
85 queryTficf :: DPS.Query
86 queryTficf = [sql|
87 -- TODO add CTE for groups
88 WITH input(masterUsername,typenameList,typenameDoc,ngramsTypeId,corpusId,listId)
89 AS ((VALUES(?::"text", ? :: "int4", ?::"int4", ?::"int4",?::"int4",?::"int4"))),
90 -- AS ((VALUES('gargantua'::"text", 5 :: "int4", 4::"int4", 4::"int4",1018::"int4",1019::"int4"))),
91
92 list_master AS (
93 SELECT n.id,n.name,n.user_id from nodes n
94 JOIN input ON n.typename = input.typenameList
95 JOIN auth_user a ON a.id = n.user_id
96 WHERE
97 a.username = input.masterUsername
98 ),
99
100 ngrams_master AS (
101 SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight FROM nodes_ngrams nng
102 JOIN list_master ON list_master.id = nng.node_id
103 JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
104 JOIN nodes n ON n.id = nng2.node_id
105 JOIN input ON input.typenameDoc = n.typename
106 JOIN ngrams ng ON ng.id = nng2.ngrams_id
107 WHERE
108 nng.ngrams_type = input.ngramsTypeId
109 -- AND n.hyperdata -> 'lang' = 'en'
110 GROUP BY ng.id,ng.terms
111 ),
112
113 ngrams_user AS (
114 SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight
115 FROM nodes_ngrams nng
116 JOIN list_master ON list_master.id = nng.node_id
117
118 JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
119 JOIN nodes_nodes nn ON nn.node2_id = nng2.node_id
120
121 JOIN ngrams ng ON ng.id = nng2.ngrams_id
122 JOIN input ON nn.node1_id = input.corpusId
123
124 WHERE
125 nng.ngrams_type = input.ngramsTypeId
126 -- AND n.hyperdata -> 'lang' = 'en'
127 GROUP BY ng.id,ng.terms
128 )
129
130
131 SELECT nu.id,nu.terms,SUM(nm.weight) wm,SUM(nu.weight) wu
132 FROM ngrams_user nu
133 JOIN ngrams_master nm ON nm.id = nu.id
134 WHERE
135 nm.weight > 1
136 AND
137 nu.weight > 1
138 GROUP BY nu.id,nu.terms
139 --ORDER BY wm DESC
140 --LIMIT 1000
141
142 |]
143
144
145