[FIX] Clean Text before sending it to NLP micro services + tests + clean code for...

[gargantext.git] / src / Gargantext / Database / Action / Flow / Pairing.hs
diff --git a/src/Gargantext/Database/Action/Flow/Pairing.hs b/src/Gargantext/Database/Action/Flow/Pairing.hs

index 6d81815b2fb4694615faf8e408ee6d10457ebd39..67ee5988d782212ef29e0ba4a18558513cb3123e 100644 (file)
--- a/src/Gargantext/Database/Action/Flow/Pairing.hs
+++ b/src/Gargantext/Database/Action/Flow/Pairing.hs
@@ -9,130 +9,193 @@ Portability : POSIX
  
  -}
  
-{-# LANGUAGE QuasiQuotes       #-}
--- {-# LANGUAGE Arrows #-}
+{-# LANGUAGE QuasiQuotes #-}
+{-# LANGUAGE Arrows      #-}
  
  module Gargantext.Database.Action.Flow.Pairing
-  (pairing)
+  -- (pairing)
      where
  
-import Control.Lens (_Just, (^.))
-import Data.Map (Map, fromList)
-import Data.Maybe (catMaybes)
-import Data.Text (Text, toLower)
-import Database.PostgreSQL.Simple.SqlQQ (sql)
+import Debug.Trace (trace)
+import Control.Lens (_Just, (^.), view)
+import Data.Hashable (Hashable)
+import Data.HashMap.Strict (HashMap)
+import Data.Maybe (fromMaybe, catMaybes)
+import Data.Set (Set)
+import Data.Text (Text)
+import Gargantext.API.Ngrams.Tools
+import Gargantext.API.Ngrams.Types (NgramsTerm(..))
+import Gargantext.API.Prelude (GargNoServer)
+import Gargantext.Core
+import Gargantext.Core.Text.Metrics.CharByChar (levenshtein)
  import Gargantext.Core.Types (TableResult(..))
-import Gargantext.Database.Action.Flow.Utils
-import Gargantext.Database.Admin.Types.Node (AnnuaireId, CorpusId, ListId{-, DocId, ContactId-})
-import Gargantext.Database.Prelude (Cmd, runPGSQuery)
-import Gargantext.Database.Query.Table.Node.Children (getAllContacts)
+import Gargantext.Core.Types.Main
+import Gargantext.Database
+import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsOnlyUser)
+import Gargantext.Database.Admin.Config
  import Gargantext.Database.Admin.Types.Hyperdata -- (HyperdataContact(..))
+import Gargantext.Database.Admin.Types.Node -- (AnnuaireId, CorpusId, ListId, DocId, ContactId, NodeId)
+import Gargantext.Database.Query.Prelude (returnA, queryNodeNodeTable)
+import Gargantext.Database.Query.Table.Node (defaultList)
+import Gargantext.Database.Query.Table.Node.Children (getAllContacts)
+import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
+import Gargantext.Database.Query.Table.NodeContext_NodeContext (insertNodeContext_NodeContext)
+import Gargantext.Database.Query.Table.NodeNode (insertNodeNode)
  import Gargantext.Database.Schema.Ngrams -- (NgramsType(..))
+import Gargantext.Database.Schema.Node
+-- import Gargantext.Database.Schema.Context
+import qualified Data.HashMap.Strict as HM
  import Gargantext.Prelude hiding (sum)
-import Safe (lastMay)
-import qualified Data.Map  as DM
-import qualified Data.Text as DT
-
--- TODO mv this type in Types Main
-type Terms = Text
-
-{-
-pairing'' :: (CorpusId, CorpusId) -> (DocId -> DocId)
-pairing'' = undefined
-
-pairing' :: (CorpusId, AnnuaireId) -> (DocId -> ContactId)
-pairing' = undefined
--}
-
--- | TODO : add paring policy as parameter
-pairing :: CorpusId   -- (CorpusId,   ListId) -- Pair (Either CorpusId AnnuaireId) ListId
-        -> AnnuaireId -- (AnnuaireId, ListId) -- Pair (Either CorpusId AnnuaireId) ListId
-        -> ListId
-        -> Cmd err Int
-pairing cId aId lId = do
-  contacts' <- getAllContacts aId
-  let contactsMap = pairingPolicyToMap toLower
-                  $ toMaps extractNgramsT (tr_docs contacts')
-
-  ngramsMap' <- getNgramsTindexed cId Authors
-  let ngramsMap = pairingPolicyToMap lastName ngramsMap'
-
-  let indexedNgrams = pairMaps contactsMap ngramsMap
-
-  insertDocNgrams lId indexedNgrams
-
-lastName :: Terms -> Terms
-lastName texte = DT.toLower
-               $ maybe texte (\x -> if DT.length x > 3 then x else texte) (lastName' texte)
-  where
-    lastName' = lastMay . DT.splitOn " "
-
--- TODO: this method is dangerous (maybe equalities of the result are
--- not taken into account emergency demo plan...)
-pairingPolicyToMap :: (Terms -> Terms)
-                   -> Map (NgramsT Ngrams) a
-                   -> Map (NgramsT Ngrams) a
-pairingPolicyToMap f = DM.mapKeys (pairingPolicy f)
-
-pairingPolicy :: (Terms -> Terms)
-              -> NgramsT Ngrams
-              -> NgramsT Ngrams
-pairingPolicy f (NgramsT nt (Ngrams ng _)) = (NgramsT nt (Ngrams (f ng) 1))
-
--- | TODO : use Occurrences in place of Int
-extractNgramsT :: HyperdataContact
-               -> Map (NgramsT Ngrams) Int
-extractNgramsT contact = fromList [(NgramsT Authors    a' , 1)| a' <- authors    ]
+import Opaleye
+import qualified Data.HashMap.Strict as HashMap
+import qualified Data.List           as List
+import qualified Data.Set            as Set
+import qualified Data.Text           as Text
+
+-- | isPairedWith
+-- All NodeAnnuaire paired with a Corpus of NodeId nId:
+-- isPairedWith NodeAnnuaire corpusId
+isPairedWith :: NodeId -> NodeType -> Cmd err [NodeId]
+isPairedWith nId nt = runOpaQuery (selectQuery nt nId)
    where
-    authors    = map text2ngrams
-               $ catMaybes [ contact^.(hc_who . _Just . cw_lastName) ]
+    selectQuery :: NodeType -> NodeId -> Select (Column SqlInt4)
+    selectQuery nt' nId' = proc () -> do
+      node <- queryNodeTable -< ()
+      node_node <- optionalRestrict queryNodeNodeTable -<
+        \node_node' -> (node ^. node_id) .== (node_node' ^. nn_node2_id)
+      restrict -< (node^.node_typename)  .== sqlInt4 (toDBid nt')
+      restrict -< (view nn_node1_id <$> node_node) .=== justFields (pgNodeId nId')
+      returnA  -<  node^.node_id
  
+-----------------------------------------------------------------------
+pairing :: AnnuaireId -> CorpusId -> Maybe ListId -> GargNoServer [Int]
+pairing a c l' = do
+  l <- case l' of
+    Nothing -> defaultList c
+    Just l'' -> pure l''
+  dataPaired <- dataPairing a (c,l,Authors)
+  _ <- insertNodeNode [ NodeNode c a Nothing Nothing]
+  insertNodeContext_NodeContext $ prepareInsert c a dataPaired
+
+
+dataPairing :: AnnuaireId
+             -> (CorpusId, ListId, NgramsType)
+             -> GargNoServer (HashMap ContactId (Set DocId))
+dataPairing aId (cId, lId, ngt) = do
+  -- mc :: HM.HashMap ContactName (Set ContactId)
+  mc <- getNgramsContactId aId
+  -- md :: HM.HashMap DocAuthor   (Set DocId)
+  md <- getNgramsDocId     cId lId ngt
+  -- printDebug "dataPairing authors" (HM.keys md)
+  let result = fusion mc md
+  -- printDebug "dataPairing" (length $ HM.keys result)
+  pure result
+
+
+prepareInsert :: CorpusId -> AnnuaireId -> HashMap ContactId (Set DocId)
+              -> [(CorpusId, AnnuaireId, DocId, ContactId)]
+prepareInsert corpusId annuaireId mapContactDocs =
+  map (\(contactId,docId) -> (corpusId, docId, annuaireId, contactId))
+        $ List.concat
+        $ map (\(contactId, setDocIds)
+                -> map (\setDocId
+                         -> (contactId, setDocId)
+                       ) $ Set.toList setDocIds
+               )
+        $ HM.toList mapContactDocs
+
+------------------------------------------------------------------------
+type ContactName = NgramsTerm
+type DocAuthor   = NgramsTerm
+type Projected   = NgramsTerm
+
+fusion :: HashMap ContactName (Set ContactId)
+       -> HashMap DocAuthor   (Set DocId)
+       -> HashMap ContactId   (Set DocId)
+fusion mc md = HM.fromListWith (<>)
+             $ List.concat
+             $ map (\(docAuthor, docs)
+                     -> case (getClosest Text.toLower docAuthor (HM.keys mc)) of
+                          Nothing -> []
+                          Just author -> case HM.lookup author mc of
+                                          Nothing    -> []
+                                          Just contactIds -> map (\contactId -> (contactId, docs))
+                                                                 $ Set.toList contactIds
+                   )
+             $ HM.toList md
+
+fusion'' :: HashMap ContactName (Set ContactId)
+       -> HashMap DocAuthor   (Set DocId)
+       -> HashMap ContactId   (Set DocId)
+fusion'' mc md = hashmapReverse $ fusion' mc (hashmapReverse md)
+
+
+fusion' :: HashMap ContactName (Set ContactId)
+       -> HashMap DocId (Set DocAuthor)
+       -> HashMap DocId (Set ContactId)
+fusion' mc md = HM.fromListWith (<>)
+             $ map (\(docId, setAuthors) -> (docId, getContactIds mc $ getClosest' setAuthors (HM.keys mc)))
+             $ HM.toList md
+
+getContactIds :: HashMap ContactName (Set ContactId) -> Set ContactName -> Set ContactId
+getContactIds mapContactNames contactNames =
+  if Set.null contactNames
+     then Set.empty
+     else Set.unions $ catMaybes $ map (\contactName -> HM.lookup contactName mapContactNames) $ Set.toList contactNames
+
+getClosest' :: Set DocAuthor -> [ContactName] -> Set ContactName
+getClosest' setAuthors contactNames = trace (show (setAuthors, setContactNames)) $ setContactNames
+  where
+    setContactNames = if Set.null xs then ys else xs
+    xs = Set.fromList $ catMaybes $ map (\author -> getClosest Text.toLower author contactNames) $ Set.toList setAuthors
+    ys = Set.fromList $ catMaybes $ map (\(NgramsTerm author) -> case ((lastMay . (Text.splitOn " ")) author) of
+                                                      Nothing      -> Nothing
+                                                      Just authorReduced -> getClosest Text.toLower (NgramsTerm authorReduced) contactNames)
+                                  $ Set.toList setAuthors
  
-pairMaps :: Map (NgramsT Ngrams) a
-         -> Map (NgramsT Ngrams) NgramsId
-         -> Map NgramsIndexed (Map NgramsType a)
-pairMaps m1 m2 =
-  DM.fromList
-    [ (NgramsIndexed ng nId, DM.singleton nt n2i)
-    | (k@(NgramsT nt ng),n2i) <- DM.toList m1
-    , Just nId <- [DM.lookup k m2]
-    ]
  
------------------------------------------------------------------------
-getNgramsTindexed :: CorpusId
-                  -> NgramsType
-                  -> Cmd err (Map (NgramsT Ngrams) NgramsId)
-getNgramsTindexed corpusId ngramsType' = fromList
-    <$> map (\(ngramsId',t,n) -> (NgramsT ngramsType' (Ngrams t n),ngramsId'))
-    <$> selectNgramsTindexed corpusId ngramsType'
+getClosest :: (Text -> Text) -> NgramsTerm -> [NgramsTerm] -> Maybe NgramsTerm
+getClosest f (NgramsTerm from) candidates = fst <$> head scored
+  where
+    scored   = List.sortOn snd
+             $ List.filter (\(_,score) -> score <= 2)
+             $ map (\cand@(NgramsTerm candidate) -> (cand, levenshtein (f from) (f candidate))) candidates
+
+
+------------------------------------------------------------------------
+getNgramsContactId :: AnnuaireId
+                   -> Cmd err (HashMap ContactName (Set NodeId))
+getNgramsContactId aId = do
+  contacts <- getAllContacts aId
+  -- printDebug "getAllContexts" (tr_count contacts)
+  let paired= HM.fromListWith (<>)
+       $ map (\contact -> (toName contact, Set.singleton (contact^.node_id))
+              ) (tr_docs contacts)
+  -- printDebug "paired" (HM.keys paired)
+  pure paired
+-- POC here, should be a probabilistic function (see the one used to find lang)
+toName :: Node HyperdataContact -> NgramsTerm
+-- toName contact = NgramsTerm $ (Text.toTitle $ Text.take 1 firstName) <> ". " <> (Text.toTitle lastName)
+toName contact = NgramsTerm $ (Text.toTitle firstName) <> " " <> (Text.toTitle lastName)
    where
-    selectNgramsTindexed :: CorpusId
-                         -> NgramsType
-                         -> Cmd err [(NgramsId, Terms, Int)]
-    selectNgramsTindexed corpusId' ngramsType'' = runPGSQuery selectQuery (corpusId', ngramsTypeId ngramsType'')
-      where
-        selectQuery = [sql| SELECT n.id,n.terms,n.n from ngrams n
-                      JOIN node_node_ngrams occ ON occ.ngrams_id = n.id
-                      -- JOIN node_node_ngrams2 occ ON occ.ngrams_id = n.id
-                      JOIN nodes_nodes      nn  ON nn.node2_id   = occ.node2_id
-
-                      WHERE nn.node1_id     = ?
-                        AND occ.ngrams_type = ?
-                        AND occ.node2_id = nn.node2_id
-                      GROUP BY n.id;
-                     |]
-
-{- | TODO more typed SQL queries
-selectNgramsTindexed :: CorpusId -> NgramsType -> Query NgramsRead
-selectNgramsTindexed corpusId ngramsType = proc () -> do
-    nodeNode   <- queryNodeNodeTable     -< ()
-    nodeNgrams <- queryNodesNgramsTable  -< ()
-    ngrams     <- queryNgramsTable       -< ()
-
-    restrict -< node1_id nodeNode .== pgInt4 corpusId
-    restrict -< node2_id nodeNode .== node_id nodeNgrams
-    restrict -< ngrams_id ngrams  .== node_ngrams nodeNgrams
-
-    result <- aggregate groupBy (ngrams_id ngrams)
-    returnA -< result
---}
+    firstName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_firstName)
+    lastName  = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_lastName)
+
+getNgramsDocId :: CorpusId
+                -> ListId
+                -> NgramsType
+                -> GargNoServer (HashMap DocAuthor (Set NodeId))
+getNgramsDocId cId lId nt = do
+  lIds <- selectNodesWithUsername NodeList userMaster
+  repo <- getRepo (lId:lIds)
+  let ngs = filterListWithRoot [MapTerm, CandidateTerm] $ mapTermListRoot (lId:lIds) nt repo
+  -- printDebug "getNgramsDocId" ngs
+
+  groupNodesByNgrams ngs <$> getContextsByNgramsOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs)
+
+hashmapReverse :: (Ord a, Eq b, Hashable b)
+        => HashMap a (Set b) -> HashMap b (Set a)
+hashmapReverse m = HM.fromListWith (<>)
+          $ List.concat
+          $ map (\(k,vs) -> [ (v, Set.singleton k) | v <- Set.toList vs])
+          $ HM.toList m