2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 module Gargantext.Core.Text.List
16 -- import Data.Either (partitionEithers, Either(..))
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode ({-ngramsGroup,-} getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
26 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
27 import Gargantext.Database.Prelude (Cmd)
28 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
29 import Gargantext.Prelude
30 import Gargantext.Core.Text.List.Learn (Model(..))
31 -- import Gargantext.Core.Text.Metrics (takeScored)
32 import qualified Data.Char as Char
33 import qualified Data.List as List
34 import qualified Data.Map as Map
35 import qualified Data.Set as Set
36 import qualified Data.Text as Text
39 data NgramsListBuilder = BuilderStepO { stemSize :: Int
43 | BuilderStep1 { withModel :: Model }
44 | BuilderStepN { withModel :: Model }
45 | Tficf { nlb_lang :: Lang
48 , nlb_stopSize :: StopSize
49 , nlb_userCorpusId :: UserCorpusId
50 , nlb_masterCorpusId :: MasterCorpusId
54 data StopSize = StopSize {unStopSize :: Int}
56 -- | TODO improve grouping functions of Authors, Sources, Institutes..
57 buildNgramsLists :: Lang
63 -> Cmd err (Map NgramsType [NgramsElement])
64 buildNgramsLists l n m s uCid mCid = do
65 ngTerms <- buildNgramsTermsList l n m s uCid mCid
66 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
67 pure $ Map.unions $ othersTerms <> [ngTerms]
70 buildNgramsOthersList :: UserCorpusId
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsOthersList uCid groupIt nt = do
75 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
79 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
81 graphTerms = List.take listSize all'
82 candiTerms = List.drop listSize all'
84 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
85 , toElements CandidateTerm candiTerms
89 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
95 buildNgramsTermsList' :: UserCorpusId
97 -> ((Text, (Set Text, Set NodeId)) -> Bool)
100 -> Cmd err (Map NgramsType [NgramsElement])
102 buildNgramsTermsList' uCid groupIt stop gls is = do
103 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
106 (stops, candidates) = partitionEithers
107 $ map (\t -> if stop t then Left t else Right t)
109 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
111 (maps, candidates') = takeScored gls is
112 $ getCoocByNgrams' snd (Diagonal True)
113 $ Map.fromList candidates
116 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
119 , List.filter (\(k,_) -> List.elem k candidates') candidates
120 , List.filter (\(k,_) -> List.elem k maps) candidates
123 let ngs' = List.concat
124 $ map toNgramsElement
125 $ map (\t -> (StopTerm , toList' t)) s
126 <> map (\t -> (CandidateTerm, toList' t)) c
127 <> map (\t -> (MapTerm , toList' t)) m
129 pure $ Map.fromList [(NgramsTerms, ngs')]
135 buildNgramsTermsList :: Lang
141 -> Cmd err (Map NgramsType [NgramsElement])
142 buildNgramsTermsList _l _n _m s uCid mCid = do
143 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms
144 printDebug "head candidates" (List.take 10 $ candidates)
145 printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
149 candidatesHead = List.take candidatesSize candidates
150 candidatesTail = List.drop candidatesSize candidates
153 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
154 (map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
155 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
157 ngs = List.concat $ map toNgramsElement $ map (\(lt, (t,d)) -> (lt, ((t, (d,Set.singleton t))))) termList
159 pure $ Map.fromList [(NgramsTerms, ngs)]
167 toTermList _ _ _ [] = []
168 toTermList a b stop ns = -- trace ("computing toTermList") $
169 map (toGargList stop CandidateTerm) xs
170 <> map (toGargList stop MapTerm) ys
171 <> toTermList a b stop zs
180 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
181 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
182 case Set.toList setNgrams of
184 (parent:children) -> [parentElem] <> childrenElems
186 parentElem = mkNgramsElement parent
189 (mSetFromList children)
190 childrenElems = map (\t -> mkNgramsElement t listType
191 (Just $ RootParent parent parent)
196 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
197 toGargList stop l n = case stop n of
198 True -> (StopTerm, n)
202 isStopTerm :: StopSize -> Text -> Bool
203 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
205 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)