]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[ngrams] NgramsTerm as a newtype
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import qualified Data.Char as Char
21 import qualified Data.List as List
22 import qualified Data.Map as Map
23 import qualified Data.Set as Set
24 import qualified Data.Text as Text
25
26 import Gargantext.Prelude
27
28 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.Core (Lang(..))
31 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
32 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
33 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
34 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
35 import Gargantext.Database.Prelude (Cmd)
36 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
37 import Gargantext.Core.Text.List.Learn (Model(..))
38 -- import Gargantext.Core.Text.Metrics (takeScored)
39
40
41 data NgramsListBuilder = BuilderStepO { stemSize :: Int
42 , stemX :: Int
43 , stopSize :: Int
44 }
45 | BuilderStep1 { withModel :: Model }
46 | BuilderStepN { withModel :: Model }
47 | Tficf { nlb_lang :: Lang
48 , nlb_group1 :: Int
49 , nlb_group2 :: Int
50 , nlb_stopSize :: StopSize
51 , nlb_userCorpusId :: UserCorpusId
52 , nlb_masterCorpusId :: MasterCorpusId
53 }
54
55
56 data StopSize = StopSize {unStopSize :: Int}
57
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: Lang
60 -> Int
61 -> Int
62 -> StopSize
63 -> UserCorpusId
64 -> MasterCorpusId
65 -> Cmd err (Map NgramsType [NgramsElement])
66 buildNgramsLists l n m s uCid mCid = do
67 ngTerms <- buildNgramsTermsList l n m s uCid mCid
68 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
69 pure $ Map.unions $ othersTerms <> [ngTerms]
70
71
72 buildNgramsOthersList :: UserCorpusId
73 -> (Text -> Text)
74 -> NgramsType
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
78
79 let
80 listSize = 9
81 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
82
83 graphTerms = List.take listSize all'
84 candiTerms = List.drop listSize all'
85
86 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
87 , toElements CandidateTerm candiTerms
88 ]
89 where
90 toElements nType x =
91 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
92 | (t, _ns) <- x
93 ]
94 )]
95
96 buildNgramsTermsList :: Lang
97 -> Int
98 -> Int
99 -> StopSize
100 -> UserCorpusId
101 -> MasterCorpusId
102 -> Cmd err (Map NgramsType [NgramsElement])
103 buildNgramsTermsList l n m s uCid mCid = do
104 candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
105 -- printDebug "head candidates" (List.take 10 $ candidates)
106 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
107
108 let
109 (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
110 (candidatesMap, candidatesTailFinal) = List.splitAt 400 candidatesTail0
111
112 termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
113 <> (map (toGargList ((isStopTerm s) . fst) MapTerm) candidatesMap)
114 <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesTailFinal)
115
116 ngs = List.concat
117 $ map toNgramsElement
118 $ groupStems
119 $ map (\(listType, (t,d)) -> ( ngramsGroup l n m t
120 , GroupedText listType t d Set.empty
121 )
122 ) termList
123
124 pure $ Map.fromList [(NgramsTerms, ngs)]
125
126 type Group = Lang -> Int -> Int -> Text -> Text
127 type Stem = Text
128 type Label = Text
129 data GroupedText = GroupedText { _gt_listType :: ListType
130 , _gt_label :: Label
131 , _gt_score :: Double
132 , _gt_group :: Set Text
133 }
134 groupStems :: [(Stem, GroupedText)] -> [GroupedText]
135 groupStems = Map.elems . Map.fromListWith grouping
136 where
137 grouping (GroupedText lt1 label1 score1 group1)
138 (GroupedText lt2 label2 score2 group2)
139 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr)
140 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr)
141 where
142 lt = lt1 <> lt2
143 gr = Set.union group1 group2
144
145 toNgramsElement :: GroupedText -> [NgramsElement]
146 toNgramsElement (GroupedText listType label _ setNgrams) =
147 [parentElem] <> childrenElems
148 where
149 parent = label
150 children = Set.toList setNgrams
151 parentElem = mkNgramsElement (NgramsTerm parent)
152 listType
153 Nothing
154 (mSetFromList (NgramsTerm <$> children))
155 childrenElems = map (\t -> mkNgramsElement t listType
156 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
157 (mSetFromList [])
158 ) (NgramsTerm <$> children)
159
160
161 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
162 toGargList isStop l n = case isStop n of
163 True -> (StopTerm, n)
164 False -> (l, n)
165
166
167 isStopTerm :: StopSize -> Text -> Bool
168 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
169 where
170 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)