Question

For the life of me I cannot deserialize the protobuf file from Open Street Maps.

I am trying to deserialize the following extract: http://download.geofabrik.de/osm/north-america/us-northeast.osm.pbf to get Nodes and I am using http://code.google.com/p/protobuf-net/ as the library. I have tried to deserialize a bunch of different objects but they all come up null.

The proto files can be found here: http://trac.openstreetmap.org/browser/applications/utils/export/osm2pgsql/protobuf

Any suggestions?

Was it helpful?

Solution

Right; the problem is that this isn't just protobuf - it is a hybrid file format (defined here that includes protobuf among various formats internally. It also incorporates compression (although that looks to be optional).

I've pulled apart what I can from the spec, and I've got a C# reader here that uses protobuf-net to handle the chunks - it happily reads through that file to the end - I can tell you there are 4515 blocks (BlockHeader). When it gets to the Blob I'm a bit confused as to how the spec demarks OSMHeader and OSMData - I'm open to suggestions here! I've also used ZLIB.NET to handle the zlib compression that is being used. In the absence of getting my head around this, I've settled for processing the ZLIB data and validating it against the claimed size, to check it is at least sane.

If you can figure out (or ask the author) how they are separating OSMHeader and OSMData I'll happily crank something else in. I hope you don't mind that I've stopped here - but it has been a few hours ;p

using System;
using System.IO;
using OpenStreetMap; // where my .proto-generated entities are living
using ProtoBuf; // protobuf-net
using zlib; // ZLIB.NET    

class OpenStreetMapParser
{

    static void Main()
    {
        using (var file = File.OpenRead("us-northeast.osm.pbf"))
        {
            // from http://wiki.openstreetmap.org/wiki/ProtocolBufBinary:
            //A file contains a header followed by a sequence of fileblocks. The design is intended to allow future random-access to the contents of the file and skipping past not-understood or unwanted data.
            //The format is a repeating sequence of:
            //int4: length of the BlockHeader message in network byte order
            //serialized BlockHeader message
            //serialized Blob message (size is given in the header)

            int length, blockCount = 0;
            while (Serializer.TryReadLengthPrefix(file, PrefixStyle.Fixed32, out length))
            {
                // I'm just being lazy and re-using something "close enough" here
                // note that v2 has a big-endian option, but Fixed32 assumes little-endian - we
                // actually need the other way around (network byte order):
                uint len = (uint)length;
                len = ((len & 0xFF) << 24) | ((len & 0xFF00) << 8) | ((len & 0xFF0000) >> 8) | ((len & 0xFF000000) >> 24);
                length = (int)len;

                BlockHeader header;
                // again, v2 has capped-streams built in, but I'm deliberately
                // limiting myself to v1 features
                using (var tmp = new LimitedStream(file, length))
                {
                    header = Serializer.Deserialize<BlockHeader>(tmp);
                }
                Blob blob;
                using (var tmp = new LimitedStream(file, header.datasize))
                {
                    blob = Serializer.Deserialize<Blob>(tmp);
                }
                if(blob.zlib_data == null) throw new NotSupportedException("I'm only handling zlib here!");

                using(var ms = new MemoryStream(blob.zlib_data))
                using(var zlib = new ZLibStream(ms))
                { // at this point I'm very unclear how the OSMHeader and OSMData are packed - it isn't clear
                    // read this to the end, to check we can parse the zlib
                    int payloadLen = 0;
                    while (zlib.ReadByte() >= 0) payloadLen++;
                    if (payloadLen != blob.raw_size) throw new FormatException("Screwed that up...");
                }
                blockCount++;
                Console.WriteLine("Read block " + blockCount.ToString());


            }
            Console.WriteLine("all done");
            Console.ReadLine();
        }
    }
}
abstract class InputStream : Stream
{
    protected abstract int ReadNextBlock(byte[] buffer, int offset, int count);
    public sealed override int Read(byte[] buffer, int offset, int count)
    {
        int bytesRead, totalRead = 0;
        while (count > 0 && (bytesRead = ReadNextBlock(buffer, offset, count)) > 0)
        {
            count -= bytesRead;
            offset += bytesRead;
            totalRead += bytesRead;
            pos += bytesRead;
        }
        return totalRead;
    }
    long pos;
    public override void Write(byte[] buffer, int offset, int count)
    {
        throw new NotImplementedException();
    }
    public override void SetLength(long value)
    {
        throw new NotImplementedException();
    }
    public override long Position
    {
        get
        {
            return pos;
        }
        set
        {
            if (pos != value) throw new NotImplementedException();
        }
    }
    public override long Length
    {
        get { throw new NotImplementedException(); }
    }
    public override void Flush()
    {
        throw new NotImplementedException();
    }
    public override bool CanWrite
    {
        get { return false; }
    }
    public override bool CanRead
    {
        get { return true; }
    }
    public override bool CanSeek
    {
        get { return false; }
    }
    public override long Seek(long offset, SeekOrigin origin)
    {
        throw new NotImplementedException();
    }
}
class ZLibStream : InputStream
{   // uses ZLIB.NET: http://www.componentace.com/download/download.php?editionid=25
    private ZInputStream reader; // seriously, why isn't this a stream?
    public ZLibStream(Stream stream)
    {
        reader = new ZInputStream(stream);
    }
    public override void Close()
    {
        reader.Close();
        base.Close();
    }
    protected override int ReadNextBlock(byte[] buffer, int offset, int count)
    {
        // OMG! reader.Read is the base-stream, reader.read is decompressed! yeuch
        return reader.read(buffer, offset, count);
    }

}
// deliberately doesn't dispose the base-stream    
class LimitedStream : InputStream
{
    private Stream stream;
    private long remaining;
    public LimitedStream(Stream stream, long length)
    {
        if (length < 0) throw new ArgumentOutOfRangeException("length");
        if (stream == null) throw new ArgumentNullException("stream");
        if (!stream.CanRead) throw new ArgumentException("stream");
        this.stream = stream;
        this.remaining = length;
    }
    protected override int ReadNextBlock(byte[] buffer, int offset, int count)
    {
        if(count > remaining) count = (int)remaining;
        int bytesRead = stream.Read(buffer, offset, count);
        if (bytesRead > 0) remaining -= bytesRead;
        return bytesRead;
    }
}

OTHER TIPS

After the outline setup by Mark I figured out the last part by looking at http://git.openstreetmap.nl/index.cgi/pbf2osm.git/tree/src/main.c?h=35116112eb0066c7729a963b292faa608ddc8ad7

Here is the final code.

using System;
using System.Diagnostics;
using System.IO;
using crosby.binary;
using OSMPBF;
using PerlLLC.Tools;
using ProtoBuf;
using zlib;

namespace OpenStreetMapOperations
{
    class OpenStreetMapParser
    {
        static void Main()
        {
            using (var file = File.OpenRead(StaticTools.AssemblyDirectory + @"\us-pacific.osm.pbf"))
            {
                // from http://wiki.openstreetmap.org/wiki/ProtocolBufBinary:
                //A file contains a header followed by a sequence of fileblocks. The design is intended to allow future random-access to the contents of the file and skipping past not-understood or unwanted data.
                //The format is a repeating sequence of:
                //int4: length of the BlockHeader message in network byte order
                //serialized BlockHeader message
                //serialized Blob message (size is given in the header)

                int length, blockCount = 0;
                while (Serializer.TryReadLengthPrefix(file, PrefixStyle.Fixed32, out length))
                {
                    // I'm just being lazy and re-using something "close enough" here
                    // note that v2 has a big-endian option, but Fixed32 assumes little-endian - we
                    // actually need the other way around (network byte order):
                    length = IntLittleEndianToBigEndian((uint)length);

                    BlockHeader header;
                    // again, v2 has capped-streams built in, but I'm deliberately
                    // limiting myself to v1 features
                    using (var tmp = new LimitedStream(file, length))
                    {
                        header = Serializer.Deserialize<BlockHeader>(tmp);
                    }
                    Blob blob;
                    using (var tmp = new LimitedStream(file, header.datasize))
                    {
                        blob = Serializer.Deserialize<Blob>(tmp);
                    }
                    if (blob.zlib_data == null) throw new NotSupportedException("I'm only handling zlib here!");

                    HeaderBlock headerBlock;
                    PrimitiveBlock primitiveBlock;

                    using (var ms = new MemoryStream(blob.zlib_data))
                    using (var zlib = new ZLibStream(ms))
                    {
                        if (header.type == "OSMHeader")
                            headerBlock = Serializer.Deserialize<HeaderBlock>(zlib);

                        if (header.type == "OSMData")
                            primitiveBlock = Serializer.Deserialize<PrimitiveBlock>(zlib);
                    }
                    blockCount++;
                    Trace.WriteLine("Read block " + blockCount.ToString());


                }
                Trace.WriteLine("all done");
            }
        }

        // 4-byte number
        static int IntLittleEndianToBigEndian(uint i)
        {
            return (int)(((i & 0xff) << 24) + ((i & 0xff00) << 8) + ((i & 0xff0000) >> 8) + ((i >> 24) & 0xff));
        }
    }

    abstract class InputStream : Stream
    {
        protected abstract int ReadNextBlock(byte[] buffer, int offset, int count);
        public sealed override int Read(byte[] buffer, int offset, int count)
        {
            int bytesRead, totalRead = 0;
            while (count > 0 && (bytesRead = ReadNextBlock(buffer, offset, count)) > 0)
            {
                count -= bytesRead;
                offset += bytesRead;
                totalRead += bytesRead;
                pos += bytesRead;
            }
            return totalRead;
        }
        long pos;
        public override void Write(byte[] buffer, int offset, int count)
        {
            throw new NotImplementedException();
        }
        public override void SetLength(long value)
        {
            throw new NotImplementedException();
        }
        public override long Position
        {
            get
            {
                return pos;
            }
            set
            {
                if (pos != value) throw new NotImplementedException();
            }
        }
        public override long Length
        {
            get { throw new NotImplementedException(); }
        }
        public override void Flush()
        {
            throw new NotImplementedException();
        }
        public override bool CanWrite
        {
            get { return false; }
        }
        public override bool CanRead
        {
            get { return true; }
        }
        public override bool CanSeek
        {
            get { return false; }
        }
        public override long Seek(long offset, SeekOrigin origin)
        {
            throw new NotImplementedException();
        }
    }
    class ZLibStream : InputStream
    {   // uses ZLIB.NET: http://www.componentace.com/download/download.php?editionid=25
        private ZInputStream reader; // seriously, why isn't this a stream?
        public ZLibStream(Stream stream)
        {
            reader = new ZInputStream(stream);
        }
        public override void Close()
        {
            reader.Close();
            base.Close();
        }
        protected override int ReadNextBlock(byte[] buffer, int offset, int count)
        {
            // OMG! reader.Read is the base-stream, reader.read is decompressed! yeuch
            return reader.read(buffer, offset, count);
        }

    }
    // deliberately doesn't dispose the base-stream    
    class LimitedStream : InputStream
    {
        private Stream stream;
        private long remaining;
        public LimitedStream(Stream stream, long length)
        {
            if (length < 0) throw new ArgumentOutOfRangeException("length");
            if (stream == null) throw new ArgumentNullException("stream");
            if (!stream.CanRead) throw new ArgumentException("stream");
            this.stream = stream;
            this.remaining = length;
        }
        protected override int ReadNextBlock(byte[] buffer, int offset, int count)
        {
            if (count > remaining) count = (int)remaining;
            int bytesRead = stream.Read(buffer, offset, count);
            if (bytesRead > 0) remaining -= bytesRead;
            return bytesRead;
        }
    }
}

Yes, it came from protogen in Fileformat.cs (based on OSM Fileformat.proto file.. code below.)

package OSM_PROTO;
  message Blob {
    optional bytes raw = 1;
    optional int32 raw_size = 2; 
    optional bytes zlib_data = 3;
    optional bytes lzma_data = 4;
    optional bytes bzip2_data = 5;
  }

  message BlockHeader {
    required string type = 1;
    optional bytes indexdata = 2;
    required int32 datasize = 3;
  }

Here is the declaration of BlockHeader in generated file :

public sealed partial class BlockHeader : pb::GeneratedMessage<BlockHeader, BlockHeader.Builder> {...}

-> using pb = global::Google.ProtocolBuffers;

(ProtocolBuffers.dll) came with this package :

http://code.google.com/p/protobuf-csharp-port/downloads/detail?name=protobuf-csharp-port-2.4.1.473-full-binaries.zip&can=2&q=

Have you tried to get some smaller area? such as us-pacific.osm.pbf

Eventually it would be useful to post the error-messages.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top