2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 module Gargantext.Core.Text.List
16 -- import Data.Either (partitionEithers, Either(..))
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode ({-ngramsGroup,-} getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
26 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
27 import Gargantext.Database.Prelude (Cmd)
28 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
29 import Gargantext.Prelude
30 import Gargantext.Core.Text.List.Learn (Model(..))
31 -- import Gargantext.Core.Text.Metrics (takeScored)
32 import qualified Data.Char as Char
33 import qualified Data.List as List
34 import qualified Data.Map as Map
35 import qualified Data.Set as Set
36 import qualified Data.Text as Text
39 data NgramsListBuilder = BuilderStepO { stemSize :: Int
43 | BuilderStep1 { withModel :: Model }
44 | BuilderStepN { withModel :: Model }
45 | Tficf { nlb_lang :: Lang
48 , nlb_stopSize :: StopSize
49 , nlb_userCorpusId :: UserCorpusId
50 , nlb_masterCorpusId :: MasterCorpusId
54 data StopSize = StopSize {unStopSize :: Int}
56 -- | TODO improve grouping functions of Authors, Sources, Institutes..
57 buildNgramsLists :: Lang
63 -> Cmd err (Map NgramsType [NgramsElement])
64 buildNgramsLists l n m s uCid mCid = do
65 ngTerms <- buildNgramsTermsList l n m s uCid mCid
66 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
67 pure $ Map.unions $ othersTerms <> [ngTerms]
70 buildNgramsOthersList :: UserCorpusId
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsOthersList uCid groupIt nt = do
75 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
79 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
81 graphTerms = List.take listSize all'
82 candiTerms = List.drop listSize all'
84 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
85 , toElements CandidateTerm candiTerms
89 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
94 buildNgramsTermsList :: Lang
100 -> Cmd err (Map NgramsType [NgramsElement])
101 buildNgramsTermsList _l _n _m s uCid mCid = do
102 candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
103 printDebug "head candidates" (List.take 10 $ candidates)
104 printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
107 (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
108 (candidatesMap, candidatesTailFinal) = List.splitAt 400 candidatesTail0
110 termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
111 <> (map (toGargList ((isStopTerm s) . fst) MapTerm) candidatesMap)
112 <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesTailFinal)
115 $ map toNgramsElement
116 $ map (\(lt, (t,d)) -> (lt, ((t, (d,Set.singleton t))))) termList
118 pure $ Map.fromList [(NgramsTerms, ngs)]
126 toTermList _ _ _ [] = []
127 toTermList a b stop ns = -- trace ("computing toTermList") $
128 map (toGargList stop CandidateTerm) xs
129 <> map (toGargList stop MapTerm) ys
130 <> toTermList a b stop zs
139 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
140 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
141 case Set.toList setNgrams of
143 (parent:children) -> [parentElem] <> childrenElems
145 parentElem = mkNgramsElement parent
148 (mSetFromList children)
149 childrenElems = map (\t -> mkNgramsElement t listType
150 (Just $ RootParent parent parent)
155 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
156 toGargList isStop l n = case isStop n of
157 True -> (StopTerm, n)
161 isStopTerm :: StopSize -> Text -> Bool
162 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
164 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)