2 Module : Gargantext.Database.Metrics.TFICF
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 TFICF, generalization of TFIDF
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE QuasiQuotes #-}
18 module Gargantext.Database.Metrics.TFICF where
20 import Database.PostgreSQL.Simple.SqlQQ (sql)
21 import qualified Database.PostgreSQL.Simple as DPS
24 import Gargantext.Text.Metrics.TFICF -- (tficf)
25 import Gargantext.Prelude
26 import Gargantext.Core.Types.Individu (UsernameMaster)
27 import Gargantext.Database.Utils (Cmd, runPGSQuery)
28 import Gargantext.Database.Types.Node (ListId, CorpusId, NodeType(..))
29 import Gargantext.Database.Config (nodeTypeId)
30 import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms, NgramsType, ngramsTypeId)
32 type OccGlobal = Double
33 type OccCorpus = Double
36 getTficf :: UsernameMaster -> CorpusId -> ListId -> NgramsType
38 getTficf u cId lId ngType = do
40 c <- getTficfCorpus cId
41 ngs <- getTficfNgrams u cId lId ngType
43 pure $ map (\(nId, nTerms, wm, wn)
45 (tficf (TficfCorpus wn (fromIntegral c))
46 (TficfLanguage wm (fromIntegral g))
50 getTficfGlobal :: UsernameMaster -> Cmd err Int
51 getTficfGlobal u = maybe 0 identity <$> headMay
52 <$> map (\(DPS.Only n) -> n )
55 p = (u, nodeTypeId NodeDocument)
56 q = [sql| SELECT count(*) from nodes n
57 JOIN auth_user a ON a.id = n.user_id
63 getTficfCorpus :: CorpusId -> Cmd err Int
64 getTficfCorpus cId = maybe 0 identity <$> headMay
65 <$> map (\(DPS.Only n) -> n )
68 p = (cId, nodeTypeId NodeDocument)
69 q = [sql| WITH input(corpusId, typename) AS ((VALUES(?::"int4",?::"int4")))
70 SELECT count(*) from nodes_nodes AS nn
71 JOIN nodes AS n ON n.id = nn.node2_id
72 JOIN input ON nn.node1_id = input.corpusId
73 WHERE n.typename = input.typename;
78 getTficfNgrams :: UsernameMaster -> CorpusId -> ListId -> NgramsType
79 -> Cmd err [(NgramsId, NgramsTerms, OccGlobal, OccCorpus)]
80 getTficfNgrams u cId lId ngType = runPGSQuery queryTficf p
82 p = (u, nodeTypeId NodeList, nodeTypeId NodeDocument, ngramsTypeId ngType, cId, lId)
85 queryTficf :: DPS.Query
87 -- TODO add CTE for groups
88 WITH input(masterUsername,typenameList,typenameDoc,ngramsTypeId,corpusId,listId)
89 AS ((VALUES(?::"text", ? :: "int4", ?::"int4", ?::"int4",?::"int4",?::"int4"))),
90 -- AS ((VALUES('gargantua'::"text", 5 :: "int4", 4::"int4", 4::"int4",1018::"int4",1019::"int4"))),
93 SELECT n.id,n.name,n.user_id from nodes n
94 JOIN input ON n.typename = input.typenameList
95 JOIN auth_user a ON a.id = n.user_id
97 a.username = input.masterUsername
101 SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight FROM nodes_ngrams nng
102 JOIN list_master ON list_master.id = nng.node_id
103 JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
104 JOIN nodes n ON n.id = nng2.node_id
105 JOIN input ON input.typenameDoc = n.typename
106 JOIN ngrams ng ON ng.id = nng2.ngrams_id
108 nng.ngrams_type = input.ngramsTypeId
109 -- AND n.hyperdata -> 'lang' = 'en'
110 GROUP BY ng.id,ng.terms
114 SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight
115 FROM nodes_ngrams nng
116 JOIN list_master ON list_master.id = nng.node_id
118 JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
119 JOIN nodes_nodes nn ON nn.node2_id = nng2.node_id
121 JOIN ngrams ng ON ng.id = nng2.ngrams_id
122 JOIN input ON nn.node1_id = input.corpusId
125 nng.ngrams_type = input.ngramsTypeId
126 -- AND n.hyperdata -> 'lang' = 'en'
127 GROUP BY ng.id,ng.terms
131 SELECT nu.id,nu.terms,SUM(nm.weight) wm,SUM(nu.weight) wu
133 JOIN ngrams_master nm ON nm.id = nu.id
138 GROUP BY nu.id,nu.terms