1 module Data.Gargantext.RCT where
5 --import Data.Text (Text, words)
6 --import Data.Attoparsec.Text (anyChar, isEndOfLine, Parser, takeTill, many1, endOfLine, space, manyTill)
7 --import Control.Applicative (many)
9 -- RCT is the acronym for Referential ConText (of Text)
10 -- in the beginning there was a byte
14 -- then a list of chars called a string, we call it a Form
15 -- (removing all weird characters which are not alphanumeric)
17 -- Form -> RCT Sentence
19 -- These forms compose the RCT Sentence
20 -- an ngram is composed of multiple forms
22 -- Paragraph = [Sentence]
24 -- type Title = Paragraph
25 -- data Block = [Paragraph]
26 -- Block is taken from Pandoc
28 -- data Document = [Block]
34 -- Paragraph (abstract + title)
35 -- Sentence - Ngrams - Forms
39 --separateurs :: Parser Text
40 --separateurs = dropWhile isEndOfLine
42 --paragraphs :: Parser [Text]
43 --paragraphs = many paragraph
45 --paragraph :: Parser Text
46 --paragraph = takeTill isEndOfLine <* many1 endOfLine
48 -- forms :: Text -> [Text]