]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[DEPS] stack upgrade
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
26 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
27 import Gargantext.Database.Prelude (Cmd)
28 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
29 import Gargantext.Prelude
30 import Gargantext.Core.Text (size)
31 import Gargantext.Core.Text.List.Learn (Model(..))
32 -- import Gargantext.Core.Text.Metrics (takeScored)
33 import qualified Data.Char as Char
34 import qualified Data.List as List
35 import qualified Data.Map as Map
36 import qualified Data.Set as Set
37 import qualified Data.Text as Text
38
39
40 data NgramsListBuilder = BuilderStepO { stemSize :: Int
41 , stemX :: Int
42 , stopSize :: Int
43 }
44 | BuilderStep1 { withModel :: Model }
45 | BuilderStepN { withModel :: Model }
46 | Tficf { nlb_lang :: Lang
47 , nlb_group1 :: Int
48 , nlb_group2 :: Int
49 , nlb_stopSize :: StopSize
50 , nlb_userCorpusId :: UserCorpusId
51 , nlb_masterCorpusId :: MasterCorpusId
52 }
53
54
55 data StopSize = StopSize {unStopSize :: Int}
56
57 -- | TODO improve grouping functions of Authors, Sources, Institutes..
58 buildNgramsLists :: Lang
59 -> Int
60 -> Int
61 -> StopSize
62 -> UserCorpusId
63 -> MasterCorpusId
64 -> Cmd err (Map NgramsType [NgramsElement])
65 buildNgramsLists l n m s uCid mCid = do
66 ngTerms <- buildNgramsTermsList l n m s uCid mCid
67 othersTerms <- mapM (buildNgramsOthersList uCid identity)
68 [Authors, Sources, Institutes]
69 pure $ Map.unions $ othersTerms <> [ngTerms]
70
71
72 buildNgramsOthersList :: UserCorpusId
73 -> (Text -> Text)
74 -> NgramsType
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
78
79 let
80 listSize = 9
81 all' = List.reverse
82 $ List.sortOn (Set.size . snd . snd)
83 $ Map.toList ngs
84
85 graphTerms = List.take listSize all'
86 candiTerms = List.drop listSize all'
87
88 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
89 , toElements CandidateTerm candiTerms
90 ]
91 where
92 toElements nType x =
93 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
94 | (t,_ns) <- x
95 ]
96 )]
97
98 buildNgramsTermsList :: Lang
99 -> Int
100 -> Int
101 -> StopSize
102 -> UserCorpusId
103 -> MasterCorpusId
104 -> Cmd err (Map NgramsType [NgramsElement])
105 buildNgramsTermsList l n m s uCid mCid = do
106 candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
107 -- printDebug "head candidates" (List.take 10 $ candidates)
108 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
109
110 let
111 listSize = 400 :: Double
112 (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
113
114 (mono, multi) = List.partition (\t -> (size . fst) t < 2) candidatesTail0
115 (monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSize) mono
116 (multiHead, multiTail) = List.splitAt (round $ 0.40 * listSize) multi
117
118 termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
119 <> (map (toGargList ((isStopTerm s) . fst) MapTerm) (monoHead <> multiHead))
120 <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) (monoTail <> multiTail))
121
122 ngs = List.concat
123 $ map toNgramsElement
124 $ groupStems
125 $ map (\(listType, (t,d)) -> ( ngramsGroup l n m t
126 , GroupedText listType t d Set.empty
127 )
128 ) termList
129
130 pure $ Map.fromList [(NgramsTerms, ngs)]
131
132 type Group = Lang -> Int -> Int -> Text -> Text
133 type Stem = Text
134 type Label = Text
135 data GroupedText = GroupedText { _gt_listType :: ListType
136 , _gt_label :: Label
137 , _gt_score :: Double
138 , _gt_group :: Set Text
139 }
140 groupStems :: [(Stem, GroupedText)] -> [GroupedText]
141 groupStems = Map.elems . Map.fromListWith grouping
142 where
143 grouping (GroupedText lt1 label1 score1 group1)
144 (GroupedText lt2 label2 score2 group2)
145 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr)
146 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr)
147 where
148 lt = lt1 <> lt2
149 gr = Set.union group1 group2
150
151 toNgramsElement :: GroupedText -> [NgramsElement]
152 toNgramsElement (GroupedText listType label _ setNgrams) =
153 [parentElem] <> childrenElems
154 where
155 parent = label
156 children = Set.toList setNgrams
157 parentElem = mkNgramsElement parent
158 listType
159 Nothing
160 (mSetFromList children)
161 childrenElems = map (\t -> mkNgramsElement t listType
162 (Just $ RootParent parent parent)
163 (mSetFromList [])
164 ) children
165
166
167 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
168 toGargList isStop l n = case isStop n of
169 True -> (StopTerm, n)
170 False -> (l, n)
171
172
173 isStopTerm :: StopSize -> Text -> Bool
174 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
175 where
176 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)