defmodule CSV.Decoding.Lexer do
  use CSV.Defaults
  alias CSV.EncodingError

  @moduledoc ~S"""
  RFC 4180 compatible CSV lexer. Turns a single line of CSV input into a
  list of tagged tokens (`:content`, `:separator`, `:double_quote`,
  `:delimiter`) for consumption by the parser.
  """

  @doc """
  Lexes a `{ line, index }` tuple received from the decoder and returns the
  resulting tokens together with the line index.

  Returns `{ :ok, tokens, index }` on success, or
  `{ :error, EncodingError, "Invalid encoding", index }` when the line is not
  valid UTF-8 and no `:replacement` was given.

  ## Options

  Options get transferred from the decoder. They are:

    * `:separator`   – The separator token to use, defaults to `?,`. Must be a
      codepoint.

    * `:replacement`    – The replacement string to use where lines have bad
      encoding. Defaults to `nil`, which disables replacement.
  """

  def lex({ line, index }, options \\ []) when is_list(options) do
    separator = options |> Keyword.get(:separator, @separator)
    replacement = options |> Keyword.get(:replacement, @replacement)

    case String.valid?(line) do
      false ->
        if replacement do
          # Substitute invalid codepoints, then lex the repaired line.
          replace_bad_encoding(line, replacement) |> lex(index, separator)
        else
          { :error, EncodingError, "Invalid encoding", index }
        end
      true -> lex(line, index, separator)
    end
  end

  defp lex(line, index, separator) do
    # lex/4 always returns { :ok, tokens }; an assertive match surfaces any
    # impossible shape instead of silently passing it through.
    { :ok, tokens } = lex([], nil, line, separator)
    { :ok, tokens, index }
  end

  # Tokens are accumulated in reverse (O(1) prepend in add_token/2) and
  # reversed once in the terminal clause below, avoiding the O(n²) cost of
  # appending with `++` on every token.

  # A newline directly after a delimiter extends the current delimiter token
  # (handles CRLF line endings as a single delimiter).
  defp lex(tokens, { :delimiter, value }, << @newline :: utf8 >> <> tail, separator) do
    lex(tokens, { :delimiter, value <> << @newline :: utf8 >> }, tail, separator)
  end
  defp lex(tokens, current_token, << @newline :: utf8 >> <> tail, separator) do
    lex(tokens |> add_token(current_token), { :delimiter, << @newline :: utf8 >> }, tail, separator)
  end
  defp lex(tokens, current_token, << @carriage_return :: utf8 >> <> tail, separator) do
    lex(tokens |> add_token(current_token), { :delimiter, << @carriage_return :: utf8 >> }, tail, separator)
  end
  defp lex(tokens, current_token, << @double_quote :: utf8 >> <> tail, separator) do
    lex(tokens |> add_token(current_token), { :double_quote, << @double_quote :: utf8 >> }, tail, separator)
  end
  defp lex(tokens, current_token, << head :: utf8 >> <> tail, separator) when head == separator do
    lex(tokens |> add_token(current_token), { :separator, << separator :: utf8 >> }, tail, separator)
  end
  # Ordinary codepoints extend the current content token…
  defp lex(tokens, { :content, value }, << head :: utf8 >> <> tail, separator) do
    lex(tokens, { :content, value <> << head :: utf8 >> }, tail, separator)
  end
  # …or start a new one when there is no current token…
  defp lex(tokens, nil, << head :: utf8 >> <> tail, separator) do
    lex(tokens, { :content, << head :: utf8 >> }, tail, separator)
  end
  # …or close the previous non-content token and start a content token.
  defp lex(tokens, current_token, << head :: utf8 >> <> tail, separator) do
    lex(tokens |> add_token(current_token), { :content, << head :: utf8 >> }, tail, separator)
  end
  # End of input: flush the pending token and restore original order.
  defp lex(tokens, current_token, "", _) do
    { :ok, tokens |> add_token(current_token) |> Enum.reverse }
  end

  # Prepend the finished token (O(1)); `nil` means nothing pending.
  defp add_token(tokens, nil) do
    tokens
  end
  defp add_token(tokens, token) do
    [token | tokens]
  end

  # Replaces every invalid codepoint in `line` with `replacement`.
  defp replace_bad_encoding(line, replacement) do
    line
    |> String.codepoints
    |> Enum.map(fn codepoint -> if String.valid?(codepoint), do: codepoint, else: replacement end)
    |> Enum.join
  end
end
88