2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 module Gargantext.Core.Text.List
16 -- import Data.Either (partitionEithers, Either(..))
19 import Data.Text (Text)
20 import qualified Data.Char as Char
21 import qualified Data.List as List
22 import qualified Data.Map as Map
23 import qualified Data.Set as Set
24 import qualified Data.Text as Text
26 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
27 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
28 import Gargantext.Core (Lang(..))
29 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
30 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
31 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
32 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
33 import Gargantext.Database.Prelude (Cmd)
34 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
36 import Gargantext.Prelude
37 import Gargantext.Core.Text (size)
38 import Gargantext.Core.Text.List.Learn (Model(..))
39 -- import Gargantext.Core.Text.Metrics (takeScored)
42 data NgramsListBuilder = BuilderStepO { stemSize :: Int
46 | BuilderStep1 { withModel :: Model }
47 | BuilderStepN { withModel :: Model }
48 | Tficf { nlb_lang :: Lang
51 , nlb_stopSize :: StopSize
52 , nlb_userCorpusId :: UserCorpusId
53 , nlb_masterCorpusId :: MasterCorpusId
57 data StopSize = StopSize {unStopSize :: Int}
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: Lang
66 -> Cmd err (Map NgramsType [NgramsElement])
67 buildNgramsLists l n m s uCid mCid = do
68 ngTerms <- buildNgramsTermsList l n m s uCid mCid
69 othersTerms <- mapM (buildNgramsOthersList uCid identity)
70 [Authors, Sources, Institutes]
71 pure $ Map.unions $ othersTerms <> [ngTerms]
74 buildNgramsOthersList :: UserCorpusId
77 -> Cmd err (Map NgramsType [NgramsElement])
78 buildNgramsOthersList uCid groupIt nt = do
79 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
84 $ List.sortOn (Set.size . snd . snd)
87 graphTerms = List.take listSize all'
88 candiTerms = List.drop listSize all'
90 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
91 , toElements CandidateTerm candiTerms
95 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
100 buildNgramsTermsList :: Lang
106 -> Cmd err (Map NgramsType [NgramsElement])
107 buildNgramsTermsList l n m s uCid mCid = do
108 candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
109 -- printDebug "head candidates" (List.take 10 $ candidates)
110 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
113 listSize = 400 :: Double
114 (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
116 (mono, multi) = List.partition (\t -> (size . fst) t < 2) candidatesTail0
117 (monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSize) mono
118 (multiHead, multiTail) = List.splitAt (round $ 0.40 * listSize) multi
120 termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
121 <> (map (toGargList ((isStopTerm s) . fst) MapTerm) (monoHead <> multiHead))
122 <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) (monoTail <> multiTail))
125 $ map toNgramsElement
127 $ map (\(listType, (t,d)) -> ( ngramsGroup l n m t
128 , GroupedText listType t d Set.empty
132 pure $ Map.fromList [(NgramsTerms, ngs)]
134 type Group = Lang -> Int -> Int -> Text -> Text
137 data GroupedText = GroupedText { _gt_listType :: ListType
139 , _gt_score :: Double
140 , _gt_group :: Set Text
142 groupStems :: [(Stem, GroupedText)] -> [GroupedText]
143 groupStems = Map.elems . Map.fromListWith grouping
145 grouping (GroupedText lt1 label1 score1 group1)
146 (GroupedText lt2 label2 score2 group2)
147 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr)
148 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr)
151 gr = Set.union group1 group2
153 toNgramsElement :: GroupedText -> [NgramsElement]
154 toNgramsElement (GroupedText listType label _ setNgrams) =
155 [parentElem] <> childrenElems
158 children = Set.toList setNgrams
159 parentElem = mkNgramsElement (NgramsTerm parent)
162 (mSetFromList (NgramsTerm <$> children))
163 childrenElems = map (\t -> mkNgramsElement t listType
164 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
166 ) (NgramsTerm <$> children)
169 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
170 toGargList isStop l n = case isStop n of
171 True -> (StopTerm, n)
175 isStopTerm :: StopSize -> Text -> Bool
176 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
178 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)