2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 module Gargantext.Text.List
16 -- import Data.Either (partitionEithers, Either(..))
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Prelude (Cmd)
26 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
27 import Gargantext.Prelude
28 import Gargantext.Text.List.Learn (Model(..))
29 -- import Gargantext.Text.Metrics (takeScored)
30 import qualified Data.Char as Char
31 import qualified Data.List as List
32 import qualified Data.Map as Map
33 import qualified Data.Set as Set
34 import qualified Data.Text as Text
37 data NgramsListBuilder = BuilderStepO { stemSize :: Int
41 | BuilderStep1 { withModel :: Model }
42 | BuilderStepN { withModel :: Model }
43 | Tficf { nlb_lang :: Lang
46 , nlb_stopSize :: StopSize
47 , nlb_userCorpusId :: UserCorpusId
48 , nlb_masterCorpusId :: MasterCorpusId
52 data StopSize = StopSize {unStopSize :: Int}
54 -- | TODO improve grouping functions of Authors, Sources, Institutes..
55 buildNgramsLists :: Lang
61 -> Cmd err (Map NgramsType [NgramsElement])
62 buildNgramsLists l n m s uCid mCid = do
63 ngTerms <- buildNgramsTermsList l n m s uCid mCid
64 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
65 pure $ Map.unions $ othersTerms <> [ngTerms]
68 buildNgramsOthersList :: UserCorpusId
71 -> Cmd err (Map NgramsType [NgramsElement])
72 buildNgramsOthersList uCid groupIt nt = do
73 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
77 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
79 graphTerms = List.take listSize all'
80 candiTerms = List.drop listSize all'
82 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
83 , toElements CandidateTerm candiTerms
87 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
93 buildNgramsTermsList' :: UserCorpusId
95 -> ((Text, (Set Text, Set NodeId)) -> Bool)
98 -> Cmd err (Map NgramsType [NgramsElement])
100 buildNgramsTermsList' uCid groupIt stop gls is = do
101 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
104 (stops, candidates) = partitionEithers
105 $ map (\t -> if stop t then Left t else Right t)
107 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
109 (maps, candidates') = takeScored gls is
110 $ getCoocByNgrams' snd (Diagonal True)
111 $ Map.fromList candidates
114 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
117 , List.filter (\(k,_) -> List.elem k candidates') candidates
118 , List.filter (\(k,_) -> List.elem k maps) candidates
121 let ngs' = List.concat
122 $ map toNgramsElement
123 $ map (\t -> (StopTerm , toList' t)) s
124 <> map (\t -> (CandidateTerm, toList' t)) c
125 <> map (\t -> (MapTerm , toList' t)) m
127 pure $ Map.fromList [(NgramsTerms, ngs')]
133 buildNgramsTermsList :: Lang
139 -> Cmd err (Map NgramsType [NgramsElement])
140 buildNgramsTermsList l n m s uCid mCid = do
141 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
149 candidatesHead = List.take candidatesSize candidates
150 candidatesTail = List.drop candidatesSize candidates
153 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
154 (map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
155 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
157 ngs = List.concat $ map toNgramsElement termList
159 pure $ Map.fromList [(NgramsTerms, ngs)]
167 toTermList _ _ _ [] = []
168 toTermList a b stop ns = -- trace ("computing toTermList") $
169 map (toGargList stop CandidateTerm) xs
170 <> map (toGargList stop MapTerm) ys
171 <> toTermList a b stop zs
180 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
181 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
182 case Set.toList setNgrams of
184 (parent:children) -> [parentElem] <> childrenElems
186 parentElem = mkNgramsElement parent
189 (mSetFromList children)
190 childrenElems = map (\t -> mkNgramsElement t listType
191 (Just $ RootParent parent parent)
196 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
197 toGargList stop l n = case stop n of
198 True -> (StopTerm, n)
203 isStopTerm :: StopSize -> Text -> Bool
204 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
206 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)