2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 module Gargantext.Core.Text.List
16 -- import Data.Either (partitionEithers, Either(..))
19 import Data.Text (Text)
20 import qualified Data.Char as Char
21 import qualified Data.List as List
22 import qualified Data.Map as Map
23 import qualified Data.Set as Set
24 import qualified Data.Text as Text
26 import Gargantext.Prelude
28 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.Core (Lang(..))
31 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
32 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
33 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
34 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
35 import Gargantext.Database.Prelude (Cmd)
36 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
37 import Gargantext.Core.Text.List.Learn (Model(..))
38 -- import Gargantext.Core.Text.Metrics (takeScored)
41 data NgramsListBuilder = BuilderStepO { stemSize :: Int
45 | BuilderStep1 { withModel :: Model }
46 | BuilderStepN { withModel :: Model }
47 | Tficf { nlb_lang :: Lang
50 , nlb_stopSize :: StopSize
51 , nlb_userCorpusId :: UserCorpusId
52 , nlb_masterCorpusId :: MasterCorpusId
56 data StopSize = StopSize {unStopSize :: Int}
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: Lang
65 -> Cmd err (Map NgramsType [NgramsElement])
66 buildNgramsLists l n m s uCid mCid = do
67 ngTerms <- buildNgramsTermsList l n m s uCid mCid
68 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
69 pure $ Map.unions $ othersTerms <> [ngTerms]
72 buildNgramsOthersList :: UserCorpusId
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
81 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
83 graphTerms = List.take listSize all'
84 candiTerms = List.drop listSize all'
86 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
87 , toElements CandidateTerm candiTerms
91 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
96 buildNgramsTermsList :: Lang
102 -> Cmd err (Map NgramsType [NgramsElement])
103 buildNgramsTermsList l n m s uCid mCid = do
104 candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
105 -- printDebug "head candidates" (List.take 10 $ candidates)
106 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
109 (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
110 (candidatesMap, candidatesTailFinal) = List.splitAt 400 candidatesTail0
112 termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
113 <> (map (toGargList ((isStopTerm s) . fst) MapTerm) candidatesMap)
114 <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesTailFinal)
117 $ map toNgramsElement
119 $ map (\(listType, (t,d)) -> ( ngramsGroup l n m t
120 , GroupedText listType t d Set.empty
124 pure $ Map.fromList [(NgramsTerms, ngs)]
126 type Group = Lang -> Int -> Int -> Text -> Text
129 data GroupedText = GroupedText { _gt_listType :: ListType
131 , _gt_score :: Double
132 , _gt_group :: Set Text
134 groupStems :: [(Stem, GroupedText)] -> [GroupedText]
135 groupStems = Map.elems . Map.fromListWith grouping
137 grouping (GroupedText lt1 label1 score1 group1)
138 (GroupedText lt2 label2 score2 group2)
139 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr)
140 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr)
143 gr = Set.union group1 group2
145 toNgramsElement :: GroupedText -> [NgramsElement]
146 toNgramsElement (GroupedText listType label _ setNgrams) =
147 [parentElem] <> childrenElems
150 children = Set.toList setNgrams
151 parentElem = mkNgramsElement (NgramsTerm parent)
154 (mSetFromList (NgramsTerm <$> children))
155 childrenElems = map (\t -> mkNgramsElement t listType
156 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
158 ) (NgramsTerm <$> children)
161 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
162 toGargList isStop l n = case isStop n of
163 True -> (StopTerm, n)
167 isStopTerm :: StopSize -> Text -> Bool
168 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
170 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)