module Gargantext.Database.Action.Metrics.NgramsByNode
where
-import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
-import Data.Map.Strict.Patch (PatchMap, Replace, diff)
+--import Data.Map.Strict.Patch (PatchMap, Replace, diff)
+import Data.HashMap.Strict (HashMap)
+import Data.Map (Map)
import Data.Set (Set)
import Data.Text (Text)
-import Data.Tuple.Extra (second, swap)
+import Data.Tuple.Extra (first, second, swap)
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Debug.Trace (trace)
-import Gargantext.Core (Lang(..))
-import Gargantext.Database.Admin.Config (nodeTypeId)
+import Gargantext.Core
+import Gargantext.API.Ngrams.Types (NgramsTerm(..))
+import Gargantext.Data.HashMap.Strict.Utils as HM
import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Prelude (Cmd, runPGSQuery)
import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
import Gargantext.Prelude
-import Gargantext.Text.Metrics.TFICF
-import Gargantext.Text.Terms.Mono.Stem (stem)
-import qualified Data.List as List
-import qualified Data.Map.Strict as Map
+import qualified Data.HashMap.Strict as HM
+import qualified Data.Map as Map
import qualified Data.Set as Set
-import qualified Data.Text as Text
import qualified Database.PostgreSQL.Simple as DPS
--- | TODO: group with 2 terms only can be
--- discussed. Main purpose of this is offering
--- a first grouping option to user and get some
--- enriched data to better learn and improve that algo
-ngramsGroup :: Lang
- -> Int
- -> Int
- -> Text
- -> Text
-ngramsGroup l _m _n = Text.intercalate " "
- . map (stem l)
- -- . take n
- . List.sort
- -- . (List.filter (\t -> Text.length t > m))
- . Text.splitOn " "
- . Text.replace "-" " "
-
-
-
-getTficf :: UserCorpusId
- -> MasterCorpusId
- -> NgramsType
- -> (Text -> Text)
- -> Cmd err (Map Text (Double, Set Text))
-getTficf u m nt f = do
- u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
- m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
-
- pure $ toTficfData (countNodesByNgramsWith f u')
- (countNodesByNgramsWith f m')
-
-{-
-getTficfWith :: UserCorpusId
- -> MasterCorpusId
- -> [ListId]
- -> NgramsType
- -> Map Text (Maybe Text)
- -> Cmd err (Map Text (Double, Set Text))
-getTficfWith u m ls nt mtxt = do
- u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
- m' <- getNodesByNgramsMaster u m
-
- let f x = case Map.lookup x mtxt of
- Nothing -> x
- Just x' -> maybe x identity x'
-
- pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
--}
-
-type Context = (Double, Map Text (Double, Set Text))
-type Supra = Context
-type Infra = Context
-
-toTficfData :: Infra
- -> Supra
- -> Map Text (Double, Set Text)
-toTficfData (ti, mi) (ts, ms) =
- fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
- (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
- , ns
- )
- )
- | (t, (n,ns)) <- toList mi
- ]
-
-
-- | fst is the size of the Supra Corpus
--   snd maps each grouped term to (number of distinct docs, set of original terms)
-countNodesByNgramsWith :: (Text -> Text)
- -> Map Text (Set NodeId)
- -> (Double, Map Text (Double, Set Text))
+countNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
+ -> HashMap NgramsTerm (Set NodeId)
+ -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
countNodesByNgramsWith f m = (total, m')
where
- total = fromIntegral $ Set.size $ Set.unions $ elems m
- m' = Map.map ( swap . second (fromIntegral . Set.size))
- $ groupNodesByNgramsWith f m
+ total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
+ m' = HM.map ( swap . second (fromIntegral . Set.size))
+ $ groupNodesByNgramsWith f m
-groupNodesByNgramsWith :: (Text -> Text)
- -> Map Text (Set NodeId)
- -> Map Text (Set Text, Set NodeId)
+groupNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
+ -> HashMap NgramsTerm (Set NodeId)
+ -> HashMap NgramsTerm (Set NgramsTerm, Set NodeId)
groupNodesByNgramsWith f m =
- fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
- $ toList m
+ HM.fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
+ $ HM.toList m
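+
+-- Illustration (a sketch): assuming a hypothetical grouping function @f@ that
+-- maps "cats" to "cat" and leaves other terms unchanged, the two functions
+-- above behave as follows:
+--
+--   countNodesByNgramsWith f
+--     (HM.fromList [ (NgramsTerm "cat" , Set.fromList [NodeId 1, NodeId 2])
+--                  , (NgramsTerm "cats", Set.fromList [NodeId 2, NodeId 3])
+--                  , (NgramsTerm "dog" , Set.singleton (NodeId 4)) ])
+--   == ( 4.0  -- 4 distinct documents overall
+--      , HM.fromList
+--          [ (NgramsTerm "cat", (3.0, Set.fromList [NgramsTerm "cat", NgramsTerm "cats"]))
+--          , (NgramsTerm "dog", (1.0, Set.singleton (NgramsTerm "dog"))) ]
+--      )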
------------------------------------------------------------------------
-getNodesByNgramsUser :: CorpusId
+getNodesByNgramsUser :: HasDBid NodeType
+ => CorpusId
-> NgramsType
- -> Cmd err (Map Text (Set NodeId))
+ -> Cmd err (HashMap NgramsTerm (Set NodeId))
getNodesByNgramsUser cId nt =
- fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
+ HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
<$> selectNgramsByNodeUser cId nt
where
- selectNgramsByNodeUser :: CorpusId
+ selectNgramsByNodeUser :: HasDBid NodeType
+ => CorpusId
-> NgramsType
-> Cmd err [(NodeId, Text)]
selectNgramsByNodeUser cId' nt' =
runPGSQuery queryNgramsByNodeUser
( cId'
- , nodeTypeId NodeDocument
+ , toDBid NodeDocument
, ngramsTypeId nt'
-- , 100 :: Int -- limit
-- , 0 :: Int -- offset
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
- AND n.typename = ? -- NodeTypeId
+ AND n.typename = ? -- toDBid
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
GROUP BY nng.node2_id, ng.terms
|]
------------------------------------------------------------------------
-- TODO add groups
-getOccByNgramsOnlyFast :: CorpusId
+getOccByNgramsOnlyFast :: HasDBid NodeType
+ => CorpusId
-> NgramsType
- -> [Text]
- -> Cmd err (Map Text Int)
+ -> [NgramsTerm]
+ -> Cmd err (HashMap NgramsTerm Int)
getOccByNgramsOnlyFast cId nt ngs =
- fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
+ HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
+
+
+getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
+ => CorpusId
+ -> Int
+ -> NgramsType
+ -> [NgramsTerm]
+ -> Cmd err (HashMap NgramsTerm Int)
+getOccByNgramsOnlyFast_withSample cId int nt ngs =
+ HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser_withSample cId int nt ngs
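+
+-- A usage sketch (the argument names here are hypothetical): the Int is
+-- forwarded to TABLESAMPLE SYSTEM_ROWS in the underlying query, i.e. it is the
+-- number of document rows sampled before the occurrences are counted:
+--
+--   getOccByNgramsOnlyFast_withSample corpusId 5000 NgramsTerms terms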
+
+
getOccByNgramsOnlyFast' :: CorpusId
-> ListId
-> NgramsType
- -> [Text]
- -> Cmd err (Map Text Int)
+ -> [NgramsTerm]
+ -> Cmd err (HashMap NgramsTerm Int)
getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
- fromListWith (+) <$> map (second round) <$> run cId lId nt tms
+ HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
where
fields = [QualifiedIdentifier Nothing "text"]
run :: CorpusId
-> ListId
-> NgramsType
- -> [Text]
- -> Cmd err [(Text, Double)]
- run cId' lId' nt' tms' = runPGSQuery query
- ( Values fields (DPS.Only <$> tms')
+ -> [NgramsTerm]
+ -> Cmd err [(NgramsTerm, Double)]
+ run cId' lId' nt' tms' = fmap (first NgramsTerm) <$> runPGSQuery query
+ ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
, cId'
, lId'
, ngramsTypeId nt'
-- just slower than getOccByNgramsOnlyFast
-getOccByNgramsOnlySlow :: NodeType
+getOccByNgramsOnlySlow :: HasDBid NodeType
+ => NodeType
-> CorpusId
-> [ListId]
-> NgramsType
- -> [Text]
- -> Cmd err (Map Text Int)
+ -> [NgramsTerm]
+ -> Cmd err (HashMap NgramsTerm Int)
getOccByNgramsOnlySlow t cId ls nt ngs =
- Map.map Set.size <$> getScore' t cId ls nt ngs
+ HM.map Set.size <$> getScore' t cId ls nt ngs
where
getScore' NodeCorpus = getNodesByNgramsOnlyUser
getScore' NodeDocument = getNgramsByDocOnlyUser
getScore' _ = getNodesByNgramsOnlyUser
-getOccByNgramsOnlySafe :: CorpusId
+getOccByNgramsOnlySafe :: HasDBid NodeType
+ => CorpusId
-> [ListId]
-> NgramsType
- -> [Text]
- -> Cmd err (Map Text Int)
+ -> [NgramsTerm]
+ -> Cmd err (HashMap NgramsTerm Int)
getOccByNgramsOnlySafe cId ls nt ngs = do
printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
fast <- getOccByNgramsOnlyFast cId nt ngs
slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
when (fast /= slow) $
printDebug "getOccByNgramsOnlySafe: difference"
- (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
+ (HM.difference slow fast, HM.difference fast slow)
+ -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
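+ -- Note: unlike the old PatchMap-based diff, this pair of HM.difference
+ -- calls only reports keys missing from one of the two maps; keys present
+ -- in both with different counts are not shown.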
pure slow
-selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
+selectNgramsOccurrencesOnlyByNodeUser :: HasDBid NodeType
+ => CorpusId
-> NgramsType
- -> [Text]
- -> Cmd err [(Text, Int)]
+ -> [NgramsTerm]
+ -> Cmd err [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
+ fmap (first NgramsTerm) <$>
runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
- ( Values fields (DPS.Only <$> tms)
+ ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
, cId
- , nodeTypeId NodeDocument
+ , toDBid NodeDocument
, ngramsTypeId nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
+
+
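+-- Note: the terms are shipped to PostgreSQL as an inline VALUES list typed
+-- "text" (hence the QualifiedIdentifier field above); the queries below join
+-- it as input_rows(terms) rather than interpolating a long IN (...) list.
+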
-- Same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
-- Question: with the grouping, is the result exactly the same (since the
-- Set NodeId intersections for equivalent ngrams are not empty)?
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
- AND n.typename = ? -- NodeTypeId
+ AND n.typename = ? -- toDBid
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
GROUP BY nng.node2_id, ng.terms
|]
+
+selectNgramsOccurrencesOnlyByNodeUser_withSample :: HasDBid NodeType
+ => CorpusId
+ -> Int
+ -> NgramsType
+ -> [NgramsTerm]
+ -> Cmd err [(NgramsTerm, Int)]
+selectNgramsOccurrencesOnlyByNodeUser_withSample cId int nt tms =
+ fmap (first NgramsTerm) <$>
+ runPGSQuery queryNgramsOccurrencesOnlyByNodeUser_withSample
+ ( int
+ , toDBid NodeDocument
+ , cId
+ , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
+ , cId
+ , ngramsTypeId nt
+ )
+ where
+ fields = [QualifiedIdentifier Nothing "text"]
+
+queryNgramsOccurrencesOnlyByNodeUser_withSample :: DPS.Query
+queryNgramsOccurrencesOnlyByNodeUser_withSample = [sql|
+ WITH nodes_sample AS (SELECT id FROM nodes n TABLESAMPLE SYSTEM_ROWS (?)
+ JOIN nodes_nodes nn ON n.id = nn.node2_id
+ WHERE n.typename = ?
+ AND nn.node1_id = ?),
+ input_rows(terms) AS (?)
+ SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
+ JOIN ngrams ng ON nng.ngrams_id = ng.id
+ JOIN input_rows ir ON ir.terms = ng.terms
+ JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
+ JOIN nodes_sample n ON nn.node2_id = n.id
+ WHERE nn.node1_id = ? -- CorpusId
+ AND nng.ngrams_type = ? -- NgramsTypeId
+ AND nn.category > 0
+ GROUP BY nng.node2_id, ng.terms
+ |]
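+
+-- Deployment note (an assumption about the target database, not checked by
+-- this module): TABLESAMPLE SYSTEM_ROWS comes from the tsm_system_rows
+-- PostgreSQL extension, which has to be enabled once per database, e.g.:
+--
+--   CREATE EXTENSION IF NOT EXISTS tsm_system_rows;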
+
+
+
queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser' = [sql|
WITH input_rows(terms) AS (?)
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
- AND n.typename = ? -- NodeTypeId
+ AND n.typename = ? -- toDBid
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
GROUP BY nng.node2_id, ng.terms
|]
------------------------------------------------------------------------
-getNodesByNgramsOnlyUser :: NodeId
+getNodesByNgramsOnlyUser :: HasDBid NodeType
+ => CorpusId
-> [ListId]
-> NgramsType
- -> [Text]
- -> Cmd err (Map Text (Set NodeId))
+ -> [NgramsTerm]
+ -> Cmd err (HashMap NgramsTerm (Set NodeId))
getNodesByNgramsOnlyUser cId ls nt ngs =
- Map.unionsWith (<>)
- . map (fromListWith (<>)
+ HM.unionsWith (<>)
+ . map (HM.fromListWith (<>)
. map (second Set.singleton))
<$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
(splitEvery 1000 ngs)
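+
+-- Note: terms are sent in batches of 1000 (splitEvery) to keep the inline
+-- VALUES list bounded; the per-batch results are merged with unionsWith (<>).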
-getNgramsByNodeOnlyUser :: NodeId
+getNgramsByNodeOnlyUser :: HasDBid NodeType
+ => NodeId
-> [ListId]
-> NgramsType
- -> [Text]
- -> Cmd err (Map NodeId (Set Text))
+ -> [NgramsTerm]
+ -> Cmd err (Map NodeId (Set NgramsTerm))
getNgramsByNodeOnlyUser cId ls nt ngs =
- Map.unionsWith (<>)
- . map (fromListWith (<>)
- . map (second Set.singleton))
+ Map.unionsWith (<>)
+ . map ( Map.fromListWith (<>)
+ . map (second Set.singleton)
+ )
. map (map swap)
<$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
(splitEvery 1000 ngs)
------------------------------------------------------------------------
-selectNgramsOnlyByNodeUser :: CorpusId
+selectNgramsOnlyByNodeUser :: HasDBid NodeType
+ => CorpusId
-> [ListId]
-> NgramsType
- -> [Text]
- -> Cmd err [(Text, NodeId)]
+ -> [NgramsTerm]
+ -> Cmd err [(NgramsTerm, NodeId)]
selectNgramsOnlyByNodeUser cId ls nt tms =
+ fmap (first NgramsTerm) <$>
runPGSQuery queryNgramsOnlyByNodeUser
- ( Values fields (DPS.Only <$> tms)
+ ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
, Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, cId
- , nodeTypeId NodeDocument
+ , toDBid NodeDocument
, ngramsTypeId nt
)
where
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
- AND n.typename = ? -- NodeTypeId
+ AND n.typename = ? -- toDBid
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
GROUP BY ng.terms, nng.node2_id
|]
-selectNgramsOnlyByNodeUser' :: CorpusId
+selectNgramsOnlyByNodeUser' :: HasDBid NodeType
+ => CorpusId
-> [ListId]
-> NgramsType
-> [Text]
, Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, cId
- , nodeTypeId NodeDocument
+ , toDBid NodeDocument
, ngramsTypeId nt
)
where
|]
-getNgramsByDocOnlyUser :: NodeId
+getNgramsByDocOnlyUser :: DocId
-> [ListId]
-> NgramsType
- -> [Text]
- -> Cmd err (Map Text (Set NodeId))
+ -> [NgramsTerm]
+ -> Cmd err (HashMap NgramsTerm (Set NodeId))
getNgramsByDocOnlyUser cId ls nt ngs =
- Map.unionsWith (<>)
- . map (fromListWith (<>) . map (second Set.singleton))
+ HM.unionsWith (<>)
+ . map (HM.fromListWith (<>) . map (second Set.singleton))
<$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
selectNgramsOnlyByDocUser :: DocId
-> [ListId]
-> NgramsType
- -> [Text]
- -> Cmd err [(Text, NodeId)]
+ -> [NgramsTerm]
+ -> Cmd err [(NgramsTerm, NodeId)]
selectNgramsOnlyByDocUser dId ls nt tms =
+ fmap (first NgramsTerm) <$>
runPGSQuery queryNgramsOnlyByDocUser
- ( Values fields (DPS.Only <$> tms)
+ ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
, Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, dId
------------------------------------------------------------------------
-- | TODO filter by language, database, any social field
-getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
-getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
- . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
+getNodesByNgramsMaster :: HasDBid NodeType
+ => UserCorpusId -> MasterCorpusId -> Cmd err (HashMap Text (Set NodeId))
+getNodesByNgramsMaster ucId mcId = unionsWith (<>)
+ . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
-- . takeWhile (not . List.null)
-- . takeWhile (\l -> List.length l > 3)
<$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
-selectNgramsByNodeMaster :: Int
+selectNgramsByNodeMaster :: HasDBid NodeType
+ => Int
-> UserCorpusId
-> MasterCorpusId
-> Int
queryNgramsByNodeMaster'
( ucId
, ngramsTypeId NgramsTerms
- , nodeTypeId NodeDocument
+ , toDBid NodeDocument
, p
- , nodeTypeId NodeDocument
+ , toDBid NodeDocument
, p
, n
, mcId
- , nodeTypeId NodeDocument
+ , toDBid NodeDocument
, ngramsTypeId NgramsTerms
)
JOIN node_node_ngrams nng ON nng.node2_id = n.id
JOIN ngrams ng ON nng.ngrams_id = ng.id
WHERE nn.node1_id = ? -- UserCorpusId
- -- AND n.typename = ? -- NodeTypeId
+ -- AND n.typename = ? -- toDBid
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
AND node_pos(n.id,?) >= ?
JOIN node_node_ngrams nng ON n.id = nng.node2_id
JOIN ngrams ng ON ng.id = nng.ngrams_id
- WHERE n.parent_id = ? -- Master Corpus NodeTypeId
- AND n.typename = ? -- NodeTypeId
+ WHERE n.parent_id = ? -- MasterCorpusId
+ AND n.typename = ? -- toDBid
AND nng.ngrams_type = ? -- NgramsTypeId
GROUP BY n.id, ng.terms
)