Efficient parsing of JSON record sets in Rust

7/17/2020 • ☕️ 2 min read

A JSON record set is an array of objects. The serde_json crate does not provide a fast way to parse it out of the box, but here is an approach I came up with: I use StreamDeserializer together with custom handling that skips the array symbols such as '[' and ','.

The code:

use crate::Error;
use async_std::io::SeekFrom;
use byteorder::ReadBytesExt;
use serde_json::{Deserializer, Map, Value};
use std::io::{Read, Seek};

/// A single record of the set: one JSON object, stored as a map from field
/// name to an arbitrary JSON value.
type Record = Map<String, Value>;

/// Reader over a JSON record set (a JSON array of objects) that hands out
/// the objects one at a time instead of materialising the whole array.
///
/// `'de` is the lifetime of the mutable borrow of the underlying reader.
/// The reader must be seekable because single bytes are peeked by reading
/// them and then seeking back.
pub struct JsonRecords<'de, R>
where
    R: Read + Seek,
{
    /// Borrowed input positioned somewhere inside the JSON array.
    reader: &'de mut R,
    /// `true` until the opening `[` of the array has been consumed.
    is_array_start: bool,
}

impl<'de, R> JsonRecords<'de, R>
where
    R: Read + Seek,
{
    /// Creates a record reader over `reader`.
    ///
    /// Nothing is read here; the opening `[` is consumed lazily the first
    /// time a record is requested.
    #[inline]
    pub fn new(reader: &'de mut R) -> Self {
        Self {
            reader,
            is_array_start: true,
        }
    }

    /// Consumes the opening `[` of the array (after optional whitespace) and
    /// remembers that it has been seen.
    ///
    /// # Errors
    /// Returns an error if reading fails or the next non-whitespace byte is
    /// not `[`; in that case `is_array_start` stays `true`.
    fn skip_array_start(&mut self) -> Result<(), Error> {
        self.skip_char(b'[')?;
        self.is_array_start = false;
        Ok(())
    }

    /// Skips whitespace and then consumes exactly one byte, which must be
    /// equal to `val`.
    ///
    /// # Errors
    /// Returns `Error::Parse` if a different byte is found (the byte is still
    /// consumed), or the underlying I/O error if reading fails.
    fn skip_char(&mut self, val: u8) -> Result<(), Error> {
        self.skip_ws()?;
        let c = self.reader.read_u8()?;
        if c != val {
            // Render both bytes as characters so the message shows `[`
            // rather than its numeric code.
            return Err(Error::Parse(format!(
                "expect '{}', but was '{}'",
                val as char, c as char
            )));
        }
        Ok(())
    }

    /// Advances the reader past any JSON whitespace (space, tab, line feed,
    /// carriage return), leaving it positioned on the first other byte.
    ///
    /// # Errors
    /// Returns the underlying I/O error if a byte cannot be read (which
    /// includes running out of input) or if seeking back fails.
    fn skip_ws(&mut self) -> Result<(), Error> {
        loop {
            let c = self.reader.read_u8()?;
            match c {
                b' ' | b'\t' | b'\n' | b'\r' => continue,
                _ => {
                    // One byte too far: step back so the caller sees it.
                    self.reader.seek(SeekFrom::Current(-1))?;
                    return Ok(());
                }
            }
        }
    }

    /// Returns the next byte without consuming it, by reading it and then
    /// seeking one byte back.
    ///
    /// # Errors
    /// Returns the underlying I/O error if the read or the seek fails.
    fn peek(&mut self) -> Result<u8, Error> {
        let result = self.reader.read_u8()?;
        self.reader.seek(SeekFrom::Current(-1))?;
        Ok(result)
    }

    /// Consumes one byte and throws it away.
    ///
    /// Intended to be called right after a successful [`peek`](Self::peek).
    ///
    /// # Panics
    /// Panics if the byte cannot be read.
    fn discard(&mut self) {
        self.reader
            .read_u8()
            .expect("a byte that was just peeked must be readable");
    }

    /// Deserializes a single JSON object starting at the current position.
    ///
    /// A fresh streaming deserializer is created for every record and asked
    /// for exactly one value. Returns `None` if the deserializer reports end
    /// of input, otherwise the parsed record or the converted parse error.
    fn read_record(&mut self) -> Option<Result<Record, Error>> {
        let mut stream = Deserializer::from_reader(&mut self.reader).into_iter::<Record>();
        stream.next().map(|parsed| parsed.map_err(Into::into))
    }
}

/// Yields the objects of the JSON array one by one.
///
/// The opening `[` is consumed on the first call. Commas between records are
/// skipped leniently (leading or repeated commas are accepted). The closing
/// `]` ends the iteration with `None`. Any byte other than `]`, `,` or `{`
/// where a record is expected yields `Error::Parse`.
///
/// The iterator is not fused: calling `next` again after it returned `None`
/// or an error keeps reading from the underlying reader.
impl<'de, R> Iterator for JsonRecords<'de, R>
where
    R: Read + Seek,
{
    type Item = Result<Record, Error>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.is_array_start {
            if let Err(e) = self.skip_array_start() {
                return Some(Err(e));
            }
        }
        if let Err(e) = self.skip_ws() {
            return Some(Err(e));
        }
        loop {
            let c = match self.peek() {
                Ok(c) => c,
                Err(e) => return Some(Err(e)),
            };
            match c {
                b']' => {
                    self.discard();
                    // Best effort only: running out of input after the
                    // closing bracket is expected and not an error.
                    let _ = self.skip_ws();
                    return None;
                }
                b',' => {
                    self.discard();
                    // Report a read failure (e.g. input truncated right
                    // after the comma) to the caller instead of panicking.
                    if let Err(e) = self.skip_ws() {
                        return Some(Err(e));
                    }
                }
                b'{' => return self.read_record(),
                _ => {
                    return Some(Err(Error::Parse(format!(
                        "expected '{{' or ']', but got '{}'",
                        c as char
                    ))))
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::json_records::JsonRecords;
    use crate::Error;
    use std::io::Cursor;

    /// Every object of the array is yielded, in order, with its field value.
    #[test]
    fn test_json_array() -> Result<(), Error> {
        let json = "[{\"a\": 1}, {\"a\": 2}]";
        let mut cursor = Cursor::new(json.as_bytes().to_vec());
        let records = JsonRecords::new(&mut cursor).collect::<Result<Vec<_>, Error>>()?;
        let values: Vec<Option<i64>> = records
            .iter()
            .map(|rec| rec.get("a").and_then(|v| v.as_i64()))
            .collect();
        assert_eq!(values, vec![Some(1), Some(2)]);
        Ok(())
    }

    /// An empty array yields no records and no error.
    #[test]
    fn test_empty_array() -> Result<(), Error> {
        let mut cursor = Cursor::new(b"[]".to_vec());
        let records = JsonRecords::new(&mut cursor).collect::<Result<Vec<_>, Error>>()?;
        assert!(records.is_empty());
        Ok(())
    }

    /// Input that does not start with `[` is reported as an error.
    #[test]
    fn test_not_an_array() {
        let mut cursor = Cursor::new(b"{\"a\": 1}".to_vec());
        let mut records = JsonRecords::new(&mut cursor);
        let first = records.next().expect("an error item is expected");
        assert!(first.is_err());
    }
}

This avoids allocating an intermediate vector, which makes processing large datasets a bit faster.

Enjoy! EOF 😄