-}
{-# LANGUAGE Arrows #-}
+{-# LANGUAGE LambdaCase #-}
module Gargantext.Database.Action.Search where
import Control.Arrow (returnA)
-import Control.Lens ((^.))
-import Data.Aeson
-import Data.List (intersperse)
+import Control.Lens ((^.), view)
+import qualified Data.List as List
+import qualified Data.Map.Strict as Map
import Data.Maybe
-import Data.String (IsString(..))
-import Data.Text (Text, words, unpack, intercalate)
+import qualified Data.Set as Set
+import Data.Text (Text, unpack, intercalate)
import Data.Time (UTCTime)
-import Database.PostgreSQL.Simple (Query)
-import Database.PostgreSQL.Simple.ToField
+import Gargantext.Core
import Gargantext.Core.Types
-import Gargantext.Database.Admin.Config (nodeTypeId)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
-import Gargantext.Database.Prelude (Cmd, runPGSQuery, runOpaQuery, runCountOpaQuery)
+import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
import Gargantext.Database.Query.Facet
import Gargantext.Database.Query.Filter
-import Gargantext.Database.Query.Join (leftJoin5)
import Gargantext.Database.Query.Table.Node
-import Gargantext.Database.Query.Table.NodeNode
+import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
+import Gargantext.Database.Query.Table.Context
+import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
+import Gargantext.Database.Query.Table.NodeContext
+import Gargantext.Database.Query.Table.NodeContext_NodeContext
+import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
+import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Schema.Node
+import Gargantext.Database.Schema.Context
import Gargantext.Prelude
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
-import Opaleye hiding (Query, Order)
+import Opaleye hiding (Order)
import Data.Profunctor.Product (p4)
import qualified Opaleye as O hiding (Order)
------------------------------------------------------------------------
-searchDocInDatabase :: ParentId
- -> Text
- -> Cmd err [(NodeId, HyperdataDocument)]
+searchDocInDatabase :: HasDBid NodeType
+ => ParentId
+ -> Text
+ -> Cmd err [(NodeId, HyperdataDocument)]
searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
where
- -- | Global search query where ParentId is Master Node Corpus Id
- queryDocInDatabase :: ParentId -> Text -> O.Query (Column PGInt4, Column PGJsonb)
- queryDocInDatabase _ q = proc () -> do
+ -- | Global search query where ParentId is Master Node Corpus Id
+ queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
+ queryDocInDatabase _p q = proc () -> do
row <- queryNodeSearchTable -< ()
- restrict -< (_ns_search row) @@ (pgTSQuery (unpack q))
- restrict -< (_ns_typename row) .== (pgInt4 $ nodeTypeId NodeDocument)
+ restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
+ restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
returnA -< (_ns_id row, _ns_hyperdata row)
+------------------------------------------------------------------------
+-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
+-- search only to map/candidate terms.
+searchInCorpusWithNgrams :: HasDBid NodeType
+ => CorpusId
+ -> ListId
+ -> IsTrash
+ -> NgramsType
+ -> [[Text]]
+ -> Maybe Offset
+ -> Maybe Limit
+ -> Maybe OrderBy
+ -> Cmd err [FacetDoc]
+searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
+
+-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
+-- case only the "TF" part makes sense and so we only compute the
+-- ratio of "number of times our terms appear in given document" and
+-- "number of all terms in document" and return a sorted list of
+-- document ids
+tfidfAll :: (HasDBid NodeType, HasNodeError err) => CorpusId -> [Int] -> Cmd err [Int]
+tfidfAll cId ngramIds = do
+ let ngramIdsSet = Set.fromList ngramIds
+ lId <- defaultList cId
+ docsWithNgrams <- runOpaQuery (queryListWithNgrams lId ngramIds) :: Cmd err [(Int, Int, Int)]
+ -- NOTE The query returned docs with ANY ngramIds. We need to further
+ -- restrict to ALL ngramIds.
+ let docsNgramsM =
+ Map.fromListWith (Set.union)
+ [ (ctxId, Set.singleton ngrams_id)
+ | (ctxId, ngrams_id, _) <- docsWithNgrams]
+ let docsWithAllNgramsS = Set.fromList $ List.map fst $
+ List.filter (\(_, docNgrams) ->
+ ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
+ let docsWithAllNgrams =
+ List.filter (\(ctxId, _, _) ->
+ Set.member ctxId docsWithAllNgramsS) docsWithNgrams
+ -- printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
+ let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
+ | (ctxId, _, doc_count) <- docsWithAllNgrams]
+ -- printDebug "[tfidfAll] docsWithCounts" docsWithCounts
+ let totals = [ ( ctxId
+ , ngrams_id
+ , fromIntegral doc_count :: Double
+ , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
+ | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
+ let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
+ | (ctxId, _, doc_count, s) <- totals]
+ pure $ List.map fst $ List.reverse tfidf_sorted
+
+-- | Query for searching the 'context_node_ngrams' table so that we
+-- find docs with ANY given 'ngramIds'.
+queryListWithNgrams :: ListId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
+queryListWithNgrams lId ngramIds = proc () -> do
+ row <- queryContextNodeNgramsTable -< ()
+ restrict -< (_cnng_node_id row) .== (pgNodeId lId)
+ restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
+ returnA -< ( _cnng_context_id row
+ , _cnng_ngrams_id row
+ , _cnng_doc_count row )
+ --returnA -< row
+ -- returnA -< ( _cnng_context_id row
+ -- , _cnng_node_id row
+ -- , _cnng_ngrams_id row
+ -- , _cnng_ngramsType row
+ -- , _cnng_weight row
+ -- , _cnng_doc_count row)
+
+
------------------------------------------------------------------------
-- | todo add limit and offset and order
-searchInCorpus :: CorpusId
+searchInCorpus :: HasDBid NodeType
+ => CorpusId
-> IsTrash
-> [Text]
-> Maybe Offset
$ intercalate " | "
$ map stemIt q
-searchCountInCorpus :: CorpusId
+searchCountInCorpus :: HasDBid NodeType
+ => CorpusId
-> IsTrash
-> [Text]
-> Cmd err Int
$ intercalate " | "
$ map stemIt q
-queryInCorpus :: CorpusId
+queryInCorpus :: HasDBid NodeType
+ => CorpusId
-> IsTrash
-> Text
- -> O.Query FacetDocRead
+ -> O.Select FacetDocRead
queryInCorpus cId t q = proc () -> do
- (n, nn) <- joinInCorpus -< ()
- restrict -< (nn^.nn_node1_id) .== (toNullable $ pgNodeId cId)
+ c <- queryContextSearchTable -< ()
+ nc <- optionalRestrict queryNodeContextTable -<
+ \nc' -> (nc' ^. nc_context_id) .== _cs_id c
+ restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
restrict -< if t
- then (nn^.nn_category) .== (toNullable $ pgInt4 0)
- else (nn^.nn_category) .>= (toNullable $ pgInt4 1)
- restrict -< (n ^. ns_search) @@ (pgTSQuery (unpack q))
- restrict -< (n ^. ns_typename ) .== (pgInt4 $ nodeTypeId NodeDocument)
- returnA -< FacetDoc (n^.ns_id )
- (n^.ns_date )
- (n^.ns_name )
- (n^.ns_hyperdata)
- (nn^.nn_category)
- (nn^.nn_score )
-
-joinInCorpus :: O.Query (NodeSearchRead, NodeNodeReadNull)
-joinInCorpus = leftJoin queryNodeSearchTable queryNodeNodeTable cond
- where
- cond :: (NodeSearchRead, NodeNodeRead) -> Column PGBool
- cond (n, nn) = nn^.nn_node2_id .== _ns_id n
+ then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
+ else matchMaybe (view nc_category <$> nc) $ \case
+ Nothing -> toFields False
+ Just c' -> c' .>= sqlInt4 1
+ restrict -< (c ^. cs_search) @@ sqlTSQuery (unpack q)
+ restrict -< (c ^. cs_typename ) .== sqlInt4 (toDBid NodeDocument)
+ returnA -< FacetDoc { facetDoc_id = c^.cs_id
+ , facetDoc_created = c^.cs_date
+ , facetDoc_title = c^.cs_name
+ , facetDoc_hyperdata = c^.cs_hyperdata
+ , facetDoc_category = maybeFieldsToNullable (view nc_category <$> nc)
+ , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
+ , facetDoc_score = maybeFieldsToNullable (view nc_score <$> nc)
+ }
------------------------------------------------------------------------
searchInCorpusWithContacts
- :: CorpusId
+ :: HasDBid NodeType
+ => CorpusId
-> AnnuaireId
-> [Text]
-> Maybe Offset
searchInCorpusWithContacts cId aId q o l _order =
runOpaQuery $ limit' l
$ offset' o
- $ orderBy ( desc _fp_score)
+ $ orderBy (desc _fp_score)
$ selectGroup cId aId
$ intercalate " | "
$ map stemIt q
-selectContactViaDoc
- :: CorpusId
- -> AnnuaireId
- -> Text
- -> QueryArr ()
- ( Column (Nullable PGInt4)
- , Column (Nullable PGTimestamptz)
- , Column (Nullable PGJsonb)
- , Column (Nullable PGInt4)
- )
-selectContactViaDoc cId aId q = proc () -> do
- (doc, (corpus_doc, (_contact_doc, (annuaire_contact, contact)))) <- queryContactViaDoc -< ()
- restrict -< (doc^.ns_search) @@ (pgTSQuery $ unpack q )
- restrict -< (doc^.ns_typename) .== (pgInt4 $ nodeTypeId NodeDocument)
- restrict -< (corpus_doc^.nn_node1_id) .== (toNullable $ pgNodeId cId)
- restrict -< (annuaire_contact^.nn_node1_id) .== (toNullable $ pgNodeId aId)
- restrict -< (contact^.node_typename) .== (toNullable $ pgInt4 $ nodeTypeId NodeContact)
- returnA -< ( contact^.node_id
- , contact^.node_date
- , contact^.node_hyperdata
- , toNullable $ pgInt4 1
- )
-
-selectGroup :: NodeId
- -> NodeId
- -> Text
- -> Select FacetPairedReadNull
+selectGroup :: HasDBid NodeType
+ => CorpusId
+ -> AnnuaireId
+ -> Text
+ -> Select FacetPairedRead
selectGroup cId aId q = proc () -> do
(a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
(selectContactViaDoc cId aId q) -< ()
returnA -< FacetPaired a b c d
-queryContactViaDoc :: O.Query ( NodeSearchRead
- , ( NodeNodeReadNull
- , ( NodeNodeReadNull
- , ( NodeNodeReadNull
- , NodeReadNull
- )
- )
- )
- )
-queryContactViaDoc =
- leftJoin5
- queryNodeTable
- queryNodeNodeTable
- queryNodeNodeTable
- queryNodeNodeTable
- queryNodeSearchTable
- cond12
- cond23
- cond34
- cond45
- where
- cond12 :: (NodeNodeRead, NodeRead) -> Column PGBool
- cond12 (annuaire_contact, contact) = contact^.node_id .== annuaire_contact^.nn_node2_id
-
- cond23 :: ( NodeNodeRead
- , ( NodeNodeRead
- , NodeReadNull
- )
- ) -> Column PGBool
- cond23 (contact_doc, (annuaire_contact, _)) = contact_doc^.nn_node1_id .== annuaire_contact^.nn_node2_id
-
- cond34 :: ( NodeNodeRead
- , ( NodeNodeRead
- , ( NodeNodeReadNull
- , NodeReadNull
- )
- )
- ) -> Column PGBool
- cond34 (corpus_doc, (contact_doc, (_,_))) = corpus_doc^.nn_node2_id .== contact_doc^.nn_node2_id
-
-
- cond45 :: ( NodeSearchRead
- , ( NodeNodeRead
- , ( NodeNodeReadNull
- , ( NodeNodeReadNull
- , NodeReadNull
- )
- )
- )
- ) -> Column PGBool
- cond45 (doc, (corpus_doc, (_,(_,_)))) = doc^.ns_id .== corpus_doc^.nn_node2_id
-
-
-------------------------------------------------------------------------
-
-newtype TSQuery = UnsafeTSQuery [Text]
-
--- | TODO [""] -> panic "error"
-toTSQuery :: [Text] -> TSQuery
-toTSQuery txt = UnsafeTSQuery $ map stemIt txt
-
-
-instance IsString TSQuery
- where
- fromString = UnsafeTSQuery . words . cs
-
-
-instance ToField TSQuery
- where
- toField (UnsafeTSQuery xs)
- = Many $ intersperse (Plain " && ")
- $ map (\q -> Many [ Plain "plainto_tsquery("
- , Escape (cs q)
- , Plain ")"
- ]
- ) xs
-
-data Order = Asc | Desc
-
-instance ToField Order
- where
- toField Asc = Plain "ASC"
- toField Desc = Plain "DESC"
-
--- TODO
--- FIX fav
--- ADD ngrams count
--- TESTS
-textSearchQuery :: Query
-textSearchQuery = "SELECT n.id, n.hyperdata->'publication_year' \
-\ , n.hyperdata->'title' \
-\ , n.hyperdata->'source' \
-\ , n.hyperdata->'authors' \
-\ , COALESCE(nn.score,null) \
-\ FROM nodes n \
-\ LEFT JOIN nodes_nodes nn ON nn.node2_id = n.id \
-\ WHERE \
-\ n.search @@ (?::tsquery) \
-\ AND (n.parent_id = ? OR nn.node1_id = ?) \
-\ AND n.typename = ? \
-\ ORDER BY n.hyperdata -> 'publication_date' ? \
-\ offset ? limit ?;"
-
--- | Text Search Function for Master Corpus
--- TODO : text search for user corpus
--- Example:
--- textSearchTest :: ParentId -> TSQuery -> Cmd err [(Int, Value, Value, Value, Value, Maybe Int)]
--- textSearchTest pId q = textSearch q pId 5 0 Asc
-textSearch :: TSQuery -> ParentId
- -> Limit -> Offset -> Order
- -> Cmd err [(Int,Value,Value,Value, Value, Maybe Int)]
-textSearch q p l o ord = runPGSQuery textSearchQuery (q,p,p,typeId,ord,o,l)
- where
- typeId = nodeTypeId NodeDocument
-
+selectContactViaDoc
+ :: HasDBid NodeType
+ => CorpusId
+ -> AnnuaireId
+ -> Text
+ -> SelectArr ()
+ ( Field SqlInt4
+ , Field SqlTimestamptz
+ , Field SqlJsonb
+ , Field SqlInt4
+ )
+selectContactViaDoc cId aId query = proc () -> do
+ --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
+ (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
+ restrict -< matchMaybe (view cs_search <$> doc) $ \case
+ Nothing -> toFields False
+ Just s -> s @@ sqlTSQuery (unpack query)
+ restrict -< (view cs_typename <$> doc) .=== justFields (sqlInt4 (toDBid NodeDocument))
+ restrict -< (view nc_node_id <$> corpus) .=== justFields (pgNodeId cId)
+ restrict -< (view nc_node_id <$> annuaire) .=== justFields (pgNodeId aId)
+ restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
+ returnA -< ( contact ^. context_id
+ , contact ^. context_date
+ , contact ^. context_hyperdata
+ , sqlInt4 1
+ )
+queryContactViaDoc :: O.Select ( ContextRead
+ , MaybeFields NodeContextRead
+ , MaybeFields NodeContext_NodeContextRead
+ , MaybeFields NodeContextRead
+ , MaybeFields ContextSearchRead )
+queryContactViaDoc = proc () -> do
+ contact <- queryContextTable -< ()
+ annuaire <- optionalRestrict queryNodeContextTable -<
+ \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
+ nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
+ \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
+ corpus <- optionalRestrict queryNodeContextTable -<
+ \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
+ doc <- optionalRestrict queryContextSearchTable -<
+ \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)
+
+ returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)