src/Gargantext/Core/Text/Group.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Group
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 {-# LANGUAGE TemplateHaskell   #-}
  13
  14 module Gargantext.Core.Text.Group
  15   where
  16
  17 import Control.Lens (makeLenses, set)
  18 import Data.Set (Set)
  19 import Data.Map (Map)
  20 import Data.Text (Text)
  21 import Gargantext.Core (Lang(..))
  22 import Gargantext.Core.Text (size)
  23 import Gargantext.Core.Types (ListType(..)) -- (MasterCorpusId, UserCorpusId)
  24 import Gargantext.Database.Admin.Types.Node (NodeId)
  25 -- import Gargantext.Core.Text.List.Learn (Model(..))
  26 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
  27 import Gargantext.Prelude
  28 import qualified Data.Set  as Set
  29 import qualified Data.Map  as Map
  30 import qualified Data.List as List
  31 import qualified Data.Text as Text
  32
  33 {-
  34 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
  35                                       , stemX    :: !Int
  36                                       , stopSize :: !StopSize
  37                                       }
  38                        | BuilderStep1 { withModel :: !Model }
  39                        | BuilderStepN { withModel :: !Model }
  40                        | Tficf { nlb_lang           :: !Lang
  41                                , nlb_group1         :: !Int
  42                                , nlb_group2         :: !Int
  43                                , nlb_stopSize       :: !StopSize
  44                                , nlb_userCorpusId   :: !UserCorpusId
  45                                , nlb_masterCorpusId :: !MasterCorpusId
  46                                }
  47 -}
  48
  49 data StopSize = StopSize {unStopSize :: !Int}
  50
  51 -- | TODO: group with 2 terms only can be
  52 -- discussed. Main purpose of this is offering
  53 -- a first grouping option to user and get some
  54 -- enriched data to better learn and improve that algo
  55 data GroupParams = GroupParams { unGroupParams_lang  :: !Lang
  56                                , unGroupParams_len   :: !Int
  57                                , unGroupParams_limit :: !Int
  58                                , unGroupParams_stopSize :: !StopSize
  59                                }
  60                  | GroupIdentity
  61
  62 ngramsGroup :: GroupParams
  63             -> Text
  64             -> Text
  65 ngramsGroup GroupIdentity  = identity
  66 ngramsGroup (GroupParams l _m _n _) = Text.intercalate " "
  67                   . map (stem l)
  68                   -- . take n
  69                   . List.sort
  70                   -- . (List.filter (\t -> Text.length t > m))
  71                   . Text.splitOn " "
  72                   . Text.replace "-" " "
  73
  74 ------------------------------------------------------------------------
  75 mergeMapParent :: Map Text (GroupedText b)
  76                -> Map Text (Map Text Int)
  77                -> Map Text (GroupedText b)
  78 mergeMapParent = undefined
  79
  80 ------------------------------------------------------------------------
  81 toGroupedText :: Ord b
  82               => (Text -> Text)
  83               -> (a -> b)
  84               -> (a -> Set Text)
  85               -> (a -> Set NodeId)
  86               -> [(Text,a)]
  87               -> Map Stem (GroupedText b)
  88 toGroupedText fun_stem fun_score fun_texts fun_nodeIds from = groupStems' $ map group from
  89   where
  90     group (t,d) = let t' = fun_stem t
  91                    in (t', GroupedText
  92                               Nothing
  93                               t
  94                               (fun_score d)
  95                               (fun_texts d)
  96                               (size t)
  97                               t'
  98                               (fun_nodeIds d)
  99                        )
 100
 101 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
 102 groupStems = Map.elems . groupStems'
 103
 104 groupStems' :: Ord a => [(Stem, GroupedText a)] -> Map Stem (GroupedText a)
 105 groupStems' = Map.fromListWith grouping
 106   where
 107     grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
 108              (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
 109              | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
 110              | otherwise        = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
 111         where
 112           lt = lt1 <> lt2
 113           gr    = Set.union group1 group2
 114           nodes = Set.union nodes1 nodes2
 115
 116 ------------------------------------------------------------------------
 117 type Group = Lang -> Int -> Int -> Text -> Text
 118 type Stem  = Text
 119 type Label = Text
 120 data GroupedText score =
 121   GroupedText { _gt_listType :: !(Maybe ListType)
 122               , _gt_label    :: !Label
 123               , _gt_score    :: !score
 124               , _gt_children :: !(Set Text)
 125               , _gt_size     :: !Int
 126               , _gt_stem     :: !Stem
 127               , _gt_nodes    :: !(Set NodeId)
 128               } {-deriving Show--}
 129 --{-
 130 instance Show score => Show (GroupedText score) where
 131   show (GroupedText lt l s _ _ _ _) = show l <> " : " <> show lt <> " : " <> show s
 132 --}
 133
 134 instance (Eq a) => Eq (GroupedText a) where
 135   (==) (GroupedText _ _ score1 _ _ _ _)
 136        (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
 137
 138 instance (Eq a, Ord a) => Ord (GroupedText a) where
 139   compare (GroupedText _ _ score1 _ _ _ _)
 140           (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
 141
 142 -- Lenses Instances
 143 makeLenses 'GroupedText
 144
 145 ------------------------------------------------------------------------
 146 addListType :: Map Text ListType -> GroupedText a -> GroupedText a
 147 addListType m g = set gt_listType (hasListType m g) g
 148   where
 149     hasListType :: Map Text ListType -> GroupedText a -> Maybe ListType
 150     hasListType m' (GroupedText _ label _ g' _ _ _) =
 151         List.foldl' (<>) Nothing
 152       $ map (\t -> Map.lookup t m')
 153       $ Set.toList
 154       $ Set.insert label g'
 155
 156