import Control.Arrow (returnA)
import Control.Lens ((^.))
-import Data.Aeson
-import Data.List (intersperse)
+import qualified Data.List as List
+import qualified Data.Map.Strict as Map
import Data.Maybe
-import Data.String (IsString(..))
-import Data.Text (Text, words, unpack, intercalate)
+import qualified Data.Set as Set
+import Data.Text (Text, unpack, intercalate)
import Data.Time (UTCTime)
-import Database.PostgreSQL.Simple (Query)
-import Database.PostgreSQL.Simple.ToField
+import Gargantext.Core
import Gargantext.Core.Types
-import Gargantext.Database.Admin.Config (nodeTypeId)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
-import Gargantext.Database.Admin.Types.Node (NodeType(..))
-import Gargantext.Database.Prelude (Cmd, runPGSQuery, runOpaQuery, runCountOpaQuery)
+import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
import Gargantext.Database.Query.Facet
import Gargantext.Database.Query.Filter
import Gargantext.Database.Query.Join (leftJoin5)
import Gargantext.Database.Query.Table.Node
-import Gargantext.Database.Query.Table.NodeNode
+import Gargantext.Database.Query.Table.Context
+import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
+import Gargantext.Database.Query.Table.NodeContext
+import Gargantext.Database.Query.Table.NodeContext_NodeContext
+import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
+import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Schema.Node
+import Gargantext.Database.Schema.Context
import Gargantext.Prelude
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
-import Opaleye hiding (Query, Order)
+import Opaleye hiding (Order)
import Data.Profunctor.Product (p4)
import qualified Opaleye as O hiding (Order)
------------------------------------------------------------------------
-searchDocInDatabase :: ParentId
- -> Text
- -> Cmd err [(NodeId, HyperdataDocument)]
+searchDocInDatabase :: HasDBid NodeType
+ => ParentId
+ -> Text
+ -> Cmd err [(NodeId, HyperdataDocument)]
searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
where
- -- | Global search query where ParentId is Master Node Corpus Id
- queryDocInDatabase :: ParentId -> Text -> O.Query (Column PGInt4, Column PGJsonb)
- queryDocInDatabase _ q = proc () -> do
+ -- | Global search query where ParentId is Master Node Corpus Id
+ queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
+ queryDocInDatabase _p q = proc () -> do
row <- queryNodeSearchTable -< ()
- restrict -< (_ns_search row) @@ (pgTSQuery (unpack q))
- restrict -< (_ns_typename row) .== (pgInt4 $ nodeTypeId NodeDocument)
+ restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
+ restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
returnA -< (_ns_id row, _ns_hyperdata row)
+------------------------------------------------------------------------
+-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
+-- search only to map/candidate terms.
+searchInCorpusWithNgrams :: HasDBid NodeType
+ => CorpusId
+ -> ListId
+ -> IsTrash
+ -> NgramsType
+ -> [[Text]]
+ -> Maybe Offset
+ -> Maybe Limit
+ -> Maybe OrderBy
+ -> Cmd err [FacetDoc]
+searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
+
+-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
+-- case only the "TF" part makes sense and so we only compute the
+-- ratio of "number of times our terms appear in given document" and
+-- "number of all terms in document" and return a sorted list of
+-- document ids
+tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
+tfidfAll cId ngramIds = do
+ let ngramIdsSet = Set.fromList ngramIds
+ docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
+ -- NOTE The query returned docs with ANY ngramIds. We need to further
+ -- restrict to ALL ngramIds.
+ let docsNgramsM =
+ Map.fromListWith (Set.union)
+ [ (ctxId, Set.singleton ngrams_id)
+ | (ctxId, ngrams_id, _) <- docsWithNgrams]
+ let docsWithAllNgramsS = Set.fromList $ List.map fst $
+ List.filter (\(_, docNgrams) ->
+ ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
+ let docsWithAllNgrams =
+ List.filter (\(ctxId, _, _) ->
+ Set.member ctxId docsWithAllNgramsS) docsWithNgrams
+ printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
+ let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
+ | (ctxId, _, doc_count) <- docsWithAllNgrams]
+ printDebug "[tfidfAll] docsWithCounts" docsWithCounts
+ let totals = [ ( ctxId
+ , ngrams_id
+ , fromIntegral doc_count :: Double
+ , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
+ | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
+ let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
+ | (ctxId, _, doc_count, s) <- totals]
+ pure $ List.map fst $ List.reverse tfidf_sorted
+
+-- | Query for searching the 'context_node_ngrams' table so that we
+-- find docs with ANY given 'ngramIds'.
+queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
+queryCorpusWithNgrams cId ngramIds = proc () -> do
+ row <- queryContextNodeNgramsTable -< ()
+ restrict -< (_cnng_node_id row) .== (pgNodeId cId)
+ restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
+ returnA -< ( _cnng_context_id row
+ , _cnng_ngrams_id row
+ , _cnng_doc_count row)
+ --returnA -< row
+ -- returnA -< ( _cnng_context_id row
+ -- , _cnng_node_id row
+ -- , _cnng_ngrams_id row
+ -- , _cnng_ngramsType row
+ -- , _cnng_weight row
+ -- , _cnng_doc_count row)
+
+
------------------------------------------------------------------------
-- | todo add limit and offset and order
-searchInCorpus :: CorpusId
+searchInCorpus :: HasDBid NodeType
+ => CorpusId
-> IsTrash
-> [Text]
-> Maybe Offset
$ intercalate " | "
$ map stemIt q
-searchCountInCorpus :: CorpusId
+searchCountInCorpus :: HasDBid NodeType
+ => CorpusId
-> IsTrash
-> [Text]
-> Cmd err Int
$ intercalate " | "
$ map stemIt q
-queryInCorpus :: CorpusId
+queryInCorpus :: HasDBid NodeType
+ => CorpusId
-> IsTrash
-> Text
- -> O.Query FacetDocRead
+ -> O.Select FacetDocRead
queryInCorpus cId t q = proc () -> do
- (n, nn) <- joinInCorpus -< ()
- restrict -< (nn^.nn_node1_id) .== (toNullable $ pgNodeId cId)
+ (c, nc) <- joinInCorpus -< ()
+ restrict -< (nc^.nc_node_id) .== (toNullable $ pgNodeId cId)
restrict -< if t
- then (nn^.nn_category) .== (toNullable $ pgInt4 0)
- else (nn^.nn_category) .>= (toNullable $ pgInt4 1)
- restrict -< (n ^. ns_search) @@ (pgTSQuery (unpack q))
- restrict -< (n ^. ns_typename ) .== (pgInt4 $ nodeTypeId NodeDocument)
- returnA -< FacetDoc (n^.ns_id )
- (n^.ns_date )
- (n^.ns_name )
- (n^.ns_hyperdata)
- (nn^.nn_category)
- (nn^.nn_score )
+ then (nc^.nc_category) .== (toNullable $ sqlInt4 0)
+ else (nc^.nc_category) .>= (toNullable $ sqlInt4 1)
+ restrict -< (c ^. cs_search) @@ (sqlTSQuery (unpack q))
+ restrict -< (c ^. cs_typename ) .== (sqlInt4 $ toDBid NodeDocument)
+ returnA -< FacetDoc { facetDoc_id = c^.cs_id
+ , facetDoc_created = c^.cs_date
+ , facetDoc_title = c^.cs_name
+ , facetDoc_hyperdata = c^.cs_hyperdata
+ , facetDoc_category = nc^.nc_category
+ , facetDoc_ngramCount = nc^.nc_score
+ , facetDoc_score = nc^.nc_score
+ }
-joinInCorpus :: O.Query (NodeSearchRead, NodeNodeReadNull)
-joinInCorpus = leftJoin queryNodeSearchTable queryNodeNodeTable cond
+joinInCorpus :: O.Select (ContextSearchRead, NodeContextReadNull)
+joinInCorpus = leftJoin queryContextSearchTable queryNodeContextTable cond
where
- cond :: (NodeSearchRead, NodeNodeRead) -> Column PGBool
- cond (n, nn) = nn^.nn_node2_id .== _ns_id n
+ cond :: (ContextSearchRead, NodeContextRead) -> Column SqlBool
+ cond (c, nc) = nc^.nc_context_id .== _cs_id c
------------------------------------------------------------------------
searchInCorpusWithContacts
- :: CorpusId
+ :: HasDBid NodeType
+ => CorpusId
-> AnnuaireId
-> [Text]
-> Maybe Offset
searchInCorpusWithContacts cId aId q o l _order =
runOpaQuery $ limit' l
$ offset' o
- $ orderBy ( desc _fp_score)
+ $ orderBy (desc _fp_score)
$ selectGroup cId aId
$ intercalate " | "
$ map stemIt q
-selectContactViaDoc
- :: CorpusId
- -> AnnuaireId
- -> Text
- -> QueryArr ()
- ( Column (Nullable PGInt4)
- , Column (Nullable PGTimestamptz)
- , Column (Nullable PGJsonb)
- , Column (Nullable PGInt4)
- )
-selectContactViaDoc cId aId q = proc () -> do
- (doc, (corpus_doc, (_contact_doc, (annuaire_contact, contact)))) <- queryContactViaDoc -< ()
- restrict -< (doc^.ns_search) @@ (pgTSQuery $ unpack q )
- restrict -< (doc^.ns_typename) .== (pgInt4 $ nodeTypeId NodeDocument)
- restrict -< (corpus_doc^.nn_node1_id) .== (toNullable $ pgNodeId cId)
- restrict -< (annuaire_contact^.nn_node1_id) .== (toNullable $ pgNodeId aId)
- restrict -< (contact^.node_typename) .== (toNullable $ pgInt4 $ nodeTypeId NodeContact)
- returnA -< ( contact^.node_id
- , contact^.node_date
- , contact^.node_hyperdata
- , toNullable $ pgInt4 1
- )
-
-selectGroup :: NodeId
- -> NodeId
- -> Text
- -> Select FacetPairedReadNull
+selectGroup :: HasDBid NodeType
+ => CorpusId
+ -> AnnuaireId
+ -> Text
+ -> Select FacetPairedReadNull
selectGroup cId aId q = proc () -> do
(a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
(selectContactViaDoc cId aId q) -< ()
returnA -< FacetPaired a b c d
-queryContactViaDoc :: O.Query ( NodeSearchRead
- , ( NodeNodeReadNull
- , ( NodeNodeReadNull
- , ( NodeNodeReadNull
- , NodeReadNull
- )
- )
- )
- )
+selectContactViaDoc
+ :: HasDBid NodeType
+ => CorpusId
+ -> AnnuaireId
+ -> Text
+ -> SelectArr ()
+ ( Column (Nullable SqlInt4)
+ , Column (Nullable SqlTimestamptz)
+ , Column (Nullable SqlJsonb)
+ , Column (Nullable SqlInt4)
+ )
+selectContactViaDoc cId aId query = proc () -> do
+ (doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
+ restrict -< (doc^.cs_search) @@ (sqlTSQuery $ unpack query )
+ restrict -< (doc^.cs_typename) .== (sqlInt4 $ toDBid NodeDocument )
+ restrict -< (corpus^.nc_node_id) .== (toNullable $ pgNodeId cId )
+ restrict -< (annuaire^.nc_node_id) .== (toNullable $ pgNodeId aId )
+ restrict -< (contact^.context_typename) .== (toNullable $ sqlInt4 $ toDBid NodeContact)
+ returnA -< ( contact^.context_id
+ , contact^.context_date
+ , contact^.context_hyperdata
+ , toNullable $ sqlInt4 1
+ )
+
+queryContactViaDoc :: O.Select ( ContextSearchRead
+ , ( NodeContextReadNull
+ , ( NodeContext_NodeContextReadNull
+ , ( NodeContextReadNull
+ , ContextReadNull
+ )
+ )
+ )
+ )
queryContactViaDoc =
leftJoin5
- queryNodeTable
- queryNodeNodeTable
- queryNodeNodeTable
- queryNodeNodeTable
- queryNodeSearchTable
+ queryContextTable
+ queryNodeContextTable
+ queryNodeContext_NodeContextTable
+ queryNodeContextTable
+ queryContextSearchTable
cond12
cond23
cond34
cond45
where
- cond12 :: (NodeNodeRead, NodeRead) -> Column PGBool
- cond12 (annuaire_contact, contact) = contact^.node_id .== annuaire_contact^.nn_node2_id
+ cond12 :: (NodeContextRead, ContextRead) -> Column SqlBool
+ cond12 (annuaire, contact) = contact^.context_id .== annuaire^.nc_context_id
- cond23 :: ( NodeNodeRead
- , ( NodeNodeRead
- , NodeReadNull
+ cond23 :: ( NodeContext_NodeContextRead
+ , ( NodeContextRead
+ , ContextReadNull
)
- ) -> Column PGBool
- cond23 (contact_doc, (annuaire_contact, _)) = contact_doc^.nn_node1_id .== annuaire_contact^.nn_node2_id
+ ) -> Column SqlBool
+ cond23 (nodeContext_nodeContext, (annuaire, _)) = nodeContext_nodeContext^.ncnc_nodecontext2 .== annuaire^.nc_id
- cond34 :: ( NodeNodeRead
- , ( NodeNodeRead
- , ( NodeNodeReadNull
- , NodeReadNull
+ cond34 :: ( NodeContextRead
+ , ( NodeContext_NodeContextRead
+ , ( NodeContextReadNull
+ , ContextReadNull
)
)
- ) -> Column PGBool
- cond34 (corpus_doc, (contact_doc, (_,_))) = corpus_doc^.nn_node2_id .== contact_doc^.nn_node2_id
+ ) -> Column SqlBool
+ cond34 (corpus, (nodeContext_nodeContext, (_,_))) = nodeContext_nodeContext^.ncnc_nodecontext1 .== corpus^.nc_id
- cond45 :: ( NodeSearchRead
- , ( NodeNodeRead
- , ( NodeNodeReadNull
- , ( NodeNodeReadNull
- , NodeReadNull
+ cond45 :: ( ContextSearchRead
+ , ( NodeContextRead
+ , ( NodeContext_NodeContextReadNull
+ , ( NodeContextReadNull
+ , ContextReadNull
)
)
)
- ) -> Column PGBool
- cond45 (doc, (corpus_doc, (_,(_,_)))) = doc^.ns_id .== corpus_doc^.nn_node2_id
-
-
-------------------------------------------------------------------------
-
-newtype TSQuery = UnsafeTSQuery [Text]
-
--- | TODO [""] -> panic "error"
-toTSQuery :: [Text] -> TSQuery
-toTSQuery txt = UnsafeTSQuery $ map stemIt txt
-
-
-instance IsString TSQuery
- where
- fromString = UnsafeTSQuery . words . cs
-
-
-instance ToField TSQuery
- where
- toField (UnsafeTSQuery xs)
- = Many $ intersperse (Plain " && ")
- $ map (\q -> Many [ Plain "plainto_tsquery("
- , Escape (cs q)
- , Plain ")"
- ]
- ) xs
-
-data Order = Asc | Desc
-
-instance ToField Order
- where
- toField Asc = Plain "ASC"
- toField Desc = Plain "DESC"
-
--- TODO
--- FIX fav
--- ADD ngrams count
--- TESTS
-textSearchQuery :: Query
-textSearchQuery = "SELECT n.id, n.hyperdata->'publication_year' \
-\ , n.hyperdata->'title' \
-\ , n.hyperdata->'source' \
-\ , n.hyperdata->'authors' \
-\ , COALESCE(nn.score,null) \
-\ FROM nodes n \
-\ LEFT JOIN nodes_nodes nn ON nn.node2_id = n.id \
-\ WHERE \
-\ n.search @@ (?::tsquery) \
-\ AND (n.parent_id = ? OR nn.node1_id = ?) \
-\ AND n.typename = ? \
-\ ORDER BY n.hyperdata -> 'publication_date' ? \
-\ offset ? limit ?;"
-
--- | Text Search Function for Master Corpus
--- TODO : text search for user corpus
--- Example:
--- textSearchTest :: ParentId -> TSQuery -> Cmd err [(Int, Value, Value, Value, Value, Maybe Int)]
--- textSearchTest pId q = textSearch q pId 5 0 Asc
-textSearch :: TSQuery -> ParentId
- -> Limit -> Offset -> Order
- -> Cmd err [(Int,Value,Value,Value, Value, Maybe Int)]
-textSearch q p l o ord = runPGSQuery textSearchQuery (q,p,p,typeId,ord,o,l)
- where
- typeId = nodeTypeId NodeDocument
-
-
+ ) -> Column SqlBool
+ cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id