2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 module Gargantext.Text.List
16 -- import Data.Either (partitionEithers, Either(..))
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Text.Metrics.TFICF (sortTficf)
26 import Gargantext.Database.Prelude (Cmd)
27 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
28 import Gargantext.Prelude
29 import Gargantext.Text.List.Learn (Model(..))
30 -- import Gargantext.Text.Metrics (takeScored)
31 import qualified Data.Char as Char
32 import qualified Data.List as List
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35 import qualified Data.Text as Text
38 data NgramsListBuilder = BuilderStepO { stemSize :: Int
42 | BuilderStep1 { withModel :: Model }
43 | BuilderStepN { withModel :: Model }
44 | Tficf { nlb_lang :: Lang
47 , nlb_stopSize :: StopSize
48 , nlb_userCorpusId :: UserCorpusId
49 , nlb_masterCorpusId :: MasterCorpusId
53 data StopSize = StopSize {unStopSize :: Int}
55 -- | TODO improve grouping functions of Authors, Sources, Institutes..
56 buildNgramsLists :: Lang
62 -> Cmd err (Map NgramsType [NgramsElement])
63 buildNgramsLists l n m s uCid mCid = do
64 ngTerms <- buildNgramsTermsList l n m s uCid mCid
65 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
66 pure $ Map.unions $ othersTerms <> [ngTerms]
69 buildNgramsOthersList :: UserCorpusId
72 -> Cmd err (Map NgramsType [NgramsElement])
73 buildNgramsOthersList uCid groupIt nt = do
74 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
78 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
80 graphTerms = List.take listSize all'
81 candiTerms = List.drop listSize all'
83 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
84 , toElements CandidateTerm candiTerms
88 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
94 buildNgramsTermsList' :: UserCorpusId
96 -> ((Text, (Set Text, Set NodeId)) -> Bool)
99 -> Cmd err (Map NgramsType [NgramsElement])
101 buildNgramsTermsList' uCid groupIt stop gls is = do
102 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
105 (stops, candidates) = partitionEithers
106 $ map (\t -> if stop t then Left t else Right t)
108 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
110 (maps, candidates') = takeScored gls is
111 $ getCoocByNgrams' snd (Diagonal True)
112 $ Map.fromList candidates
115 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
118 , List.filter (\(k,_) -> List.elem k candidates') candidates
119 , List.filter (\(k,_) -> List.elem k maps) candidates
122 let ngs' = List.concat
123 $ map toNgramsElement
124 $ map (\t -> (StopTerm , toList' t)) s
125 <> map (\t -> (CandidateTerm, toList' t)) c
126 <> map (\t -> (MapTerm , toList' t)) m
128 pure $ Map.fromList [(NgramsTerms, ngs')]
134 buildNgramsTermsList :: Lang
140 -> Cmd err (Map NgramsType [NgramsElement])
141 buildNgramsTermsList l n m s uCid mCid = do
142 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
150 candidatesHead = List.take candidatesSize candidates
151 candidatesTail = List.drop candidatesSize candidates
154 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
155 (map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
156 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
158 ngs = List.concat $ map toNgramsElement termList
160 pure $ Map.fromList [(NgramsTerms, ngs)]
168 toTermList _ _ _ [] = []
169 toTermList a b stop ns = -- trace ("computing toTermList") $
170 map (toGargList stop CandidateTerm) xs
171 <> map (toGargList stop MapTerm) ys
172 <> toTermList a b stop zs
181 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
182 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
183 case Set.toList setNgrams of
185 (parent:children) -> [parentElem] <> childrenElems
187 parentElem = mkNgramsElement parent
190 (mSetFromList children)
191 childrenElems = map (\t -> mkNgramsElement t listType
192 (Just $ RootParent parent parent)
197 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
198 toGargList stop l n = case stop n of
199 True -> (StopTerm, n)
203 isStopTerm :: StopSize -> Text -> Bool
204 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
206 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)