defmodule CSV.Decoding.Lexer do
  @moduledoc ~S"""
  RFC 4180 compatible CSV lexer. Lexes tokens and sends them to the parser
  process.
  """

  use CSV.Defaults

  alias CSV.EncodingError

  @doc """
  Lexes strings received from a sender (the decoder) and sends the resulting
  tokens to the parser process / the receiver.

  Returns `{ :ok, tokens, index }` on success, or
  `{ :error, EncodingError, message, index }` when the line is not valid
  UTF-8 and no `:replacement` is configured.

  ## Options

  Options get transferred from the decoder. They are:

    * `:separator` – The separator token to use, defaults to `?,`. Must be a
      codepoint.

    * `:replacement` – The replacement string to use where lines have bad
      encoding. Defaults to `nil`, which disables replacement.
  """
  def lex({line, index}, options \\ []) when is_list(options) do
    separator = Keyword.get(options, :separator, @separator)
    replacement = Keyword.get(options, :replacement, @replacement)

    cond do
      String.valid?(line) ->
        lex(line, index, separator)

      replacement ->
        # Substitute invalid codepoints first so downstream token values
        # are always valid UTF-8.
        line
        |> replace_bad_encoding(replacement)
        |> lex(index, separator)

      true ->
        {:error, EncodingError, "Invalid encoding", index}
    end
  end

  # Runs the tokenizer over a single valid line and tags the token list
  # with the line index for the parser. lex/4 only ever returns { :ok, _ },
  # so no error clause is needed here.
  defp lex(line, index, separator) do
    case lex([], nil, line, separator) do
      {:ok, tokens} -> {:ok, tokens, index}
    end
  end

  # lex/4 walks the line one codepoint at a time, carrying the token
  # currently being built (or nil). Tokens are accumulated in REVERSE
  # order — O(1) prepend via add_token/2 — and reversed once in the
  # terminal clause, avoiding the O(n²) cost of appending with `++`.
  #
  # A newline directly following a delimiter extends that delimiter, so
  # a `\r\n` sequence becomes a single delimiter token.
  defp lex(tokens, {:delimiter, value}, <<@newline::utf8>> <> tail, separator) do
    lex(tokens, {:delimiter, value <> <<@newline::utf8>>}, tail, separator)
  end

  defp lex(tokens, current_token, <<@newline::utf8>> <> tail, separator) do
    lex(add_token(tokens, current_token), {:delimiter, <<@newline::utf8>>}, tail, separator)
  end

  defp lex(tokens, current_token, <<@carriage_return::utf8>> <> tail, separator) do
    lex(add_token(tokens, current_token), {:delimiter, <<@carriage_return::utf8>>}, tail, separator)
  end

  defp lex(tokens, current_token, <<@double_quote::utf8>> <> tail, separator) do
    lex(add_token(tokens, current_token), {:double_quote, <<@double_quote::utf8>>}, tail, separator)
  end

  defp lex(tokens, current_token, <<head::utf8>> <> tail, separator) when head == separator do
    lex(add_token(tokens, current_token), {:separator, <<separator::utf8>>}, tail, separator)
  end

  # An open content token keeps growing while ordinary codepoints arrive.
  defp lex(tokens, {:content, value}, <<head::utf8>> <> tail, separator) do
    lex(tokens, {:content, value <> <<head::utf8>>}, tail, separator)
  end

  defp lex(tokens, nil, <<head::utf8>> <> tail, separator) do
    lex(tokens, {:content, <<head::utf8>>}, tail, separator)
  end

  # Any other current token (delimiter, separator, double quote) is
  # finished, and a fresh content token begins.
  defp lex(tokens, current_token, <<head::utf8>> <> tail, separator) do
    lex(add_token(tokens, current_token), {:content, <<head::utf8>>}, tail, separator)
  end

  # End of line: flush the pending token and restore original order.
  defp lex(tokens, current_token, "", _separator) do
    {:ok, tokens |> add_token(current_token) |> Enum.reverse()}
  end

  # Prepends a token onto the reversed accumulator; nil means no token
  # is currently open, so there is nothing to add.
  defp add_token(tokens, nil), do: tokens
  defp add_token(tokens, token), do: [token | tokens]

  # Replaces every invalid codepoint in the line with the configured
  # replacement string, leaving valid codepoints untouched.
  defp replace_bad_encoding(line, replacement) do
    line
    |> String.codepoints()
    |> Enum.map_join(fn codepoint ->
      if String.valid?(codepoint), do: codepoint, else: replacement
    end)
  end
end