Loading truly truly huge text files with a QAbstractListModel

Sometimes people want to do crazy stuff like loading a gigabyte sized plain text file into a Qt view that can handle QAbstractListModel. Like for example a QML ListView. You know, the kind of files you generate with this commando:

base64 /dev/urandom | head -c 100000000 > /tmp/file.txt

But, how do they do it?

FileModel.h

So we will make a custom QAbstractListModel. Its private member fields I will explain later:

#ifndef FILEMODEL_H
#define FILEMODEL_H

#include <QObject>
#include <QVariant>
#include <QAbstractListModel>
#include <QFile>

class FileModel: public QAbstractListModel {
    Q_OBJECT

    Q_PROPERTY(QString fileName READ fileName WRITE setFileName NOTIFY fileNameChanged )
public:
    explicit FileModel( QObject* a_parent = nullptr );
    virtual ~FileModel();

    int columnCount(const QModelIndex &parent) const;
    int rowCount( const QModelIndex& parent =  QModelIndex() ) const Q_DECL_OVERRIDE;
    QVariant data( const QModelIndex& index, int role = Qt::DisplayRole ) const  Q_DECL_OVERRIDE;
    QVariant headerData( int section, Qt::Orientation orientation,
                         int role = Qt::DisplayRole ) const  Q_DECL_OVERRIDE;
    void setFileName(const QString &fileName);
    QString fileName () const
        { return m_file->fileName(); }
signals:
    void fileNameChanged();
private:
    QFile *m_file, *m_index;
    uchar *map_file;
    uchar *map_index;
    int m_rowCount;
    void clear();
};

#endif// FILEMODEL_H

FileModel.cpp

We will basically scan the very big source text file for newline characters. We’ll write the offsets of those to a file suffixed with “.mmap”. We’ll use that new file as a sort of “partition table” for the very big source text file, in the data() function of QAbstractListModel. But instead of sectors and files, it points to newlines.

The reason why the scanner itself isn’t using the mmap’s address space is because apparently reading blocks of 4kb is faster than reading each and every byte from the mmap in search of \n characters. Or at least on my hardware it was.

You should probably do the scanning in small qEventLoop iterations (make sure to use nonblocking reads, then) or in a thread, as your very big source text file can be on a unreliable or slow I/O device. Plus it’s very big, else you wouldn’t be doing this (please promise me to just read the entire text file in memory unless it’s hundreds of megabytes in size: don’t micro optimize your silly homework notepad.exe clone).

Note that this is demo code with a lot of bugs like not checking for \r and god knows what memory leaks and stuff was remaining when it suddenly worked. I leave it to the reader to improve this. An example is that you should check for validity of the “.mmap” file: your very big source text file might have changed since the newline partition table was made.

Knowing that I’ll soon find this all over the place without any of its bugs fixed, here it comes ..

#include "FileModel.h"

#include <QDebug>

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>

FileModel::FileModel( QObject* a_parent )
    : QAbstractListModel( a_parent )
    , m_file (nullptr)
    , m_index(nullptr)
    , m_rowCount ( 0 ) { }

FileModel::~FileModel() { clear(); }

void FileModel::clear()
{
    if (m_file) {
        if (m_file->isOpen() && map_file != nullptr)
            m_file->unmap(map_file);
        delete m_file;
    }
    if (m_index) {
        if (m_index->isOpen() && map_index != nullptr)
            m_index->unmap(map_index);
        delete m_index;
    }
}

void FileModel::setFileName(const QString &fileName)
{
   clear();
   m_rowCount = 0;
   m_file = new QFile(fileName);
   int cur = 0;
   m_index = new QFile(m_file->fileName() + ".mmap");
   if (m_file->open(QIODevice::ReadOnly)) {
       if (!m_index->exists()) {
           char rbuffer[4096];
           m_index->open(QIODevice::WriteOnly);
           char nulbuffer[4];
           int idxnul = 0;
           memset( nulbuffer +0, idxnul >> 24 & 0xff, 1 );
           memset( nulbuffer +1, idxnul >> 16 & 0xff, 1 );
           memset( nulbuffer +2, idxnul >>  8 & 0xff, 1 );
           memset( nulbuffer +3, idxnul >>  0 & 0xff, 1 );
           m_index->write( nulbuffer, sizeof(quint32));
           qDebug() << "Indexing to" << m_index->fileName();
           while (!m_file->atEnd()) {
               int in = m_file->read(rbuffer, 4096);
               if (in == -1)
                   break;
               char *newline = (char*) 1;
               char *last = rbuffer;
               while (newline != 0) {
                   newline = strchr ( last, '\n');
                   if (newline != 0) {
                     char buffer[4];
                     int idx = cur + (newline - rbuffer);
                     memset( buffer +0, idx >> 24 & 0xff, 1 );
                     memset( buffer +1, idx >> 16 & 0xff, 1 );
                     memset( buffer +2, idx >>  8 & 0xff, 1 );
                     memset( buffer +3, idx >>  0 & 0xff, 1 );
                     m_index->write( buffer, sizeof(quint32));
                     m_rowCount++;
                     last = newline + 1;
                  }
               }
               cur += in;
           }
           m_index->close();
           m_index->open(QFile::ReadOnly);
           qDebug() << "done";
       } else {
           m_index->open(QFile::ReadOnly);
           m_rowCount = m_index->size() / 4;
       }
       map_file= m_file->map(0, m_file->size(), QFileDevice::NoOptions);
       qDebug() << "Done loading " << m_rowCount << " lines";
       map_index = m_index->map(0, m_index->size(), QFileDevice::NoOptions);
   }
   beginResetModel();
   endResetModel();
   emit fileNameChanged();
}

static quint32
read_uint32 (const quint8 *data)
{
    return data[0] << 24 |
           data[1] << 16 |
           data[2] << 8 |
           data[3];
}

int FileModel::rowCount( const QModelIndex& parent ) const
{
    Q_UNUSED( parent );
    return m_rowCount;
}

int FileModel::columnCount(const QModelIndex &parent) const
{
    Q_UNUSED( parent );
    return 1;
}

QVariant FileModel::data( const QModelIndex& index, int role ) const
{
    if( !index.isValid() )
        return QVariant();
    if (role == Qt::DisplayRole) {
        QVariant ret;
        quint32 pos_i = read_uint32(map_index + ( 4 * index.row() ) );
        quint32 end_i;
        if ( index.row() == m_rowCount-1 )
            end_i = m_file->size();
        else
            end_i = read_uint32(map_index + ( 4 * (index.row()+1) ) );
        uchar *position;
        position = map_file +  pos_i;
        uchar *end = map_file + end_i;
        int length = end - position;
        char *buffer = (char*) alloca(length +1);
        memset (buffer, 0, length+1);
        strncpy (buffer, (char*) position, length);
        ret = QVariant(QString(buffer));
        return ret;
    }
    return QVariant();
}

QVariant FileModel::headerData( int section, Qt::Orientation orientation, int role ) const
{
    Q_UNUSED(section);
    Q_UNUSED(orientation);
    if (role != Qt::DisplayRole)
           return QVariant();
    return QString("header");
}

main.cpp

#include <QGuiApplication>
#include <QQmlApplicationEngine>
#include <QtQml>// qmlRegisterType

#include "FileModel.h"

int main(int argc, char *argv[])
{
    QGuiApplication app(argc, argv);
    qmlRegisterType<FileModel>( "FileModel", 1, 0, "FileModel" );
    QQmlApplicationEngine engine;
    engine.load(QUrl(QStringLiteral("qrc:/main.qml")));
    return app.exec();
}

main.qml

import QtQuick 2.3
import QtQuick.Window 2.2
import FileModel 1.0

Window {
    visible: true

    FileModel { id: fileModel }
    ListView {
        id: list
        anchors.fill: parent
        delegate: Text { text: display }
        MouseArea {
            anchors.fill: parent
            onClicked: {
                list.model = fileModel
                fileModel.fileName = "/tmp/file.txt"
            }
        }
    }
}

profile.pro

TEMPLATE = app
QT += qml quick
CONFIG += c++11
SOURCES += main.cpp \
    FileModel.cpp
RESOURCES += qml.qrc
HEADERS += \
    FileModel.h

qml.qrc

<RCC>
    <qresource prefix="/">
        <file>main.qml</file>
    </qresource>
</RCC>