2 Module : Gargantext.Database.Metrics.TFICF
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 TFICF, generalization of TFIDF
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE QuasiQuotes #-}
17 {-# LANGUAGE RankNTypes #-}
19 module Gargantext.Database.Metrics.TFICF where
21 import Database.PostgreSQL.Simple.SqlQQ (sql)
22 import qualified Database.PostgreSQL.Simple as DPS
25 import Gargantext.Text.Metrics.TFICF -- (tficf)
26 import Gargantext.Prelude
27 import Gargantext.Core.Types.Individu (UsernameMaster)
28 import Gargantext.Database.Utils (Cmd, runPGSQuery)
29 import Gargantext.Database.Types.Node (ListId, CorpusId, NodeType(..))
30 import Gargantext.Database.Config (nodeTypeId)
31 import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms, NgramsType, ngramsTypeId)
33 type OccGlobal = Double
34 type OccCorpus = Double
37 getTficf :: UsernameMaster -> CorpusId -> ListId -> NgramsType
39 getTficf u cId lId ngType = do
41 c <- getTficfCorpus cId
42 ngs <- getTficfNgrams u cId lId ngType
44 pure $ map (\(nId, nTerms, wm, wn)
46 (tficf (TficfCorpus wn (fromIntegral c))
47 (TficfLanguage wm (fromIntegral g))
51 getTficfGlobal :: UsernameMaster -> Cmd err Int
52 getTficfGlobal u = maybe 0 identity <$> headMay
53 <$> map (\(DPS.Only n) -> n )
56 p = (u, nodeTypeId NodeDocument)
57 q = [sql| SELECT count(*) from nodes n
58 JOIN auth_user a ON a.id = n.user_id
64 getTficfCorpus :: CorpusId -> Cmd err Int
65 getTficfCorpus cId = maybe 0 identity <$> headMay
66 <$> map (\(DPS.Only n) -> n )
69 p = (cId, nodeTypeId NodeDocument)
70 q = [sql| WITH input(corpusId, typename) AS ((VALUES(?::"int4",?::"int4")))
71 SELECT count(*) from nodes_nodes AS nn
72 JOIN nodes AS n ON n.id = nn.node2_id
73 JOIN input ON nn.node1_id = input.corpusId
74 WHERE n.typename = input.typename;
79 getTficfNgrams :: UsernameMaster -> CorpusId -> ListId -> NgramsType
80 -> Cmd err [(NgramsId, NgramsTerms, OccGlobal, OccCorpus)]
81 getTficfNgrams u cId lId ngType = runPGSQuery queryTficf p
83 p = (u, nodeTypeId NodeList, nodeTypeId NodeDocument, ngramsTypeId ngType, cId, lId)
86 queryTficf :: DPS.Query
88 -- TODO add CTE for groups
89 WITH input(masterUsername,typenameList,typenameDoc,ngramsTypeId,corpusId,listId)
90 AS ((VALUES(?::"text", ? :: "int4", ?::"int4", ?::"int4",?::"int4",?::"int4"))),
91 -- AS ((VALUES('gargantua'::"text", 5 :: "int4", 4::"int4", 4::"int4",1018::"int4",1019::"int4"))),
94 SELECT n.id,n.name,n.user_id from nodes n
95 JOIN input ON n.typename = input.typenameList
96 JOIN auth_user a ON a.id = n.user_id
98 a.username = input.masterUsername
102 SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight FROM nodes_ngrams nng
103 JOIN list_master ON list_master.id = nng.node_id
104 JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
105 JOIN nodes n ON n.id = nng2.node_id
106 JOIN input ON input.typenameDoc = n.typename
107 JOIN ngrams ng ON ng.id = nng2.ngrams_id
109 nng.ngrams_type = input.ngramsTypeId
110 -- AND n.hyperdata -> 'lang' = 'en'
111 GROUP BY ng.id,ng.terms
115 SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight
116 FROM nodes_ngrams nng
117 JOIN list_master ON list_master.id = nng.node_id
119 JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
120 JOIN nodes_nodes nn ON nn.node2_id = nng2.node_id
122 JOIN ngrams ng ON ng.id = nng2.ngrams_id
123 JOIN input ON nn.node1_id = input.corpusId
126 nng.ngrams_type = input.ngramsTypeId
127 -- AND n.hyperdata -> 'lang' = 'en'
128 GROUP BY ng.id,ng.terms
132 SELECT nu.id,nu.terms,SUM(nm.weight) wm,SUM(nu.weight) wu
134 JOIN ngrams_master nm ON nm.id = nu.id
139 GROUP BY nu.id,nu.terms