Markup Language Parser in C++

Question

I'm new to C++ and decided to write a parser for a markup language. I chose org-mode because it seemed relatively straightforward.

parsertest.cpp

#include <cstdlib>
#include <cstring>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <regex>
#include <string>
#include <tuple>
#include <vector>

#include <nlohmann/json.hpp>

namespace fs = std::filesystem;
using json = nlohmann::json;

// Command-line settings. The single global instance `f` is filled in by
// parseArgs() and read by main() and ParseF::collateAll().
struct Flags {
  int v_nh = 1;        // number of headings still allowed (value of -nh)
  bool f_nh = false;   // true once -nh was given
  bool f_p = false;    // print all collected entries
  std::string v_dir;   // directory given with -d
  bool f_dir = false;  // traverse v_dir instead of using explicit files
  bool f_json = false; // write JSON output
} f;

// Patterns used by the ParseF::checkFor*() functions to classify one line,
// plus the shared match object they write their result into.
struct RegexStrings {
  std::smatch s; // result of the most recent std::regex_search call

  // One or more stars followed by a space. The star count must be at least
  // one: with `\**` every line that merely starts with a space was matched
  // and reported as a heading.
  const std::regex headings_h{R"(^\*+ )"};
  const std::regex quote_g{R"(^\#\+begin_quote$)"};
  const std::regex quote_h{R"(^\#\+end_quote$)"};
  // src_g recognises the opening keyword; src_f additionally requires a
  // space after it, i.e. arguments following the keyword.
  const std::regex src_f{R"(^\#\+begin_src .*)"};
  const std::regex src_g{R"(^\#\+begin_src)"};
  const std::regex src_h{R"(^\#\+end_src$)"};
  const std::regex url_h{R"(^https?:\/\/[[:graph:]]+)"};
  const std::regex timestamp_h{R"(^\[\d\d-\d\d-\d\d \d\d:\d\d:\d\d\]$)"};
} rs;

/**
 * Collect every regular file with the extension ".org" below `path`.
 *
 * The match is on the real file extension, so a file called "cyborg" is no
 * longer picked up just because its name ends in the letters "org".
 * Subdirectories that may not be read are skipped instead of aborting the
 * whole traversal.
 *
 * @param path Directory to traverse recursively.
 * @return The matching paths. Element 0 is always an empty placeholder
 *         string; main() starts iterating at index 1.
 * @throws std::filesystem::filesystem_error if `path` cannot be iterated.
 */
std::vector<std::string> getOrgFilesRecursively(std::string path) {
  namespace stdfs = std::filesystem;

  std::vector<std::string> flist{""};
  const auto options = stdfs::directory_options::skip_permission_denied;
  for (const auto &p :
       stdfs::recursive_directory_iterator(stdfs::path{path}, options)) {
    if (p.is_regular_file() && p.path().extension() == ".org") {
      // .string() is explicit: the implicit path -> std::string conversion
      // only exists where the native path character type is char.
      flist.emplace_back(p.path().string());
    }
  }
  return flist;
}

std::vector<std::string> parseArgs(int a, char *b[]) {
  if (strcmp(b[1], "-help") == 0) {
    std::cout << "Org-Mode parser\n"
              << "-nh: Number of Headings\n"
              << "-p: Print object stack\n"
              << "-d: Recurse directory\n"
              << "-j: dump json\n";
    exit(0);
  }
  std::vector<std::string> flist{""};
  for (int i = 1; i < a; i++) {
    if (b[i][0] != '-') {
      auto carg = std::string(b[i]);
      fs::path cpath{carg};
      if (carg.substr(carg.length() - 3) == "org") {
        if (!fs::exists(cpath)) {
          continue;
        }
        flist.emplace_back(carg);
      }
    } else {
      if (b[i][1] == 'n') {
        if (b[i][1] == 'h') {
          i++;
          auto carg = std::string(b[i]);
          f.v_nh = stoi(carg);
          f.f_nh = true;
        }
      }
      if (b[i][1] == 'p') {
        f.f_p = true;
      }
      if (b[i][1] == 'd') {
        i++;
        f.v_dir = std::string(b[i]);
        f.f_dir = true;
      }
      if (b[i][1] == 'j') {
        f.f_json = true;
      }
    }
  }
  if (f.f_dir)
    flist = getOrgFilesRecursively(f.v_dir);
  if (f.f_p && f.f_json)
    f.f_p = false; // You can view the json instead
  return flist;
}

// Classifies the lines of org files and collects one record per line.
struct ParseF {
  // Collected records, used when JSON output is off. Tuple layout:
  //   0: line number
  //   1: type code
  //        -1 text          0 heading      1 quote open   2 quote close
  //         3 src open      4 src close    5 url          6 timestamp
  //   2: string data
  //   3: integer data
  //   4: filename
  std::vector<std::tuple<int, int, std::string, int, std::string>> cdb;

  // State of the line currently being classified; set by the caller before
  // any checkFor*() function is invoked.
  std::string cline;
  std::string cfilename;
  int clnum = 0;

  // Collected records when JSON output is on.
  json jo;

  void printAll();
  void printLastInsert();

  // Stores one record for the current line (type, string data, int data).
  void collateAll(int ttype, std::string sdata, int idata);

  // Each check returns true when it recognised and recorded the current line.
  bool checkForHeadings();
  bool checkForQUOTE();
  bool checkForSRC();
  bool checkForURL();
  bool checkForTIMESTAMP();
};

// Writes a header line followed by every record in cdb to standard output,
// one comma-separated record per line.
void ParseF::printAll() {
  std::cout << "line, type, data, data, filename\n";
  for (const auto &[line, type, text, number, file] : cdb) {
    std::cout << line << ", " << type << ", " << text << ", " << number
              << ", " << file << "\n";
  }
}

// Writes the most recently stored record to standard output in the same
// comma-separated form as printAll(). Does nothing when no record exists.
void ParseF::printLastInsert() {
  // back() on an empty vector is undefined behaviour, and cdb is empty
  // whenever collateAll() stores into the JSON object instead.
  if (cdb.empty())
    return;

  // Bind by reference: there is no need to copy the tuple and its strings.
  const auto &ctup = cdb.back();
  std::cout << std::get<0>(ctup) << ", " << std::get<1>(ctup) << ", "
            << std::get<2>(ctup) << ", " << std::get<3>(ctup) << ", "
            << std::get<4>(ctup) << "\n";
}

// Stores one record for the current line (cfilename / clnum).
// Without JSON output the record is appended to cdb. With JSON output,
// sdata and the decimal form of idata are inserted as keys under
// jo[filename][line][type]; no value is assigned to them.
void ParseF::collateAll(int ttype, std::string sdata, int idata) {
  if (!f.f_json) {
    cdb.emplace_back(clnum, ttype, sdata, idata, cfilename);
    return;
  }

  auto &node = jo[cfilename][std::to_string(clnum)][std::to_string(ttype)];
  node[sdata];
  node[std::to_string(idata)];
}

// Records the current line as a heading (type 0) when rs.headings_h matches.
// The string data is the text after the matched prefix; the integer data is
// the length of that prefix minus one.
bool ParseF::checkForHeadings() {
  // does not need full regexes
  const bool matched = std::regex_search(cline, rs.s, rs.headings_h);
  if (matched) {
    const auto prefix = rs.s.length();
    collateAll(0, cline.substr(prefix), prefix - 1);
  }
  return matched;
}

// Records the current line as quote open (type 1) or quote close (type 2)
// when it matches rs.quote_g or rs.quote_h; returns false otherwise.
bool ParseF::checkForQUOTE() {
  int kind;
  if (std::regex_search(cline, rs.s, rs.quote_g))
    kind = 1;
  else if (std::regex_search(cline, rs.s, rs.quote_h))
    kind = 2;
  else
    return false;

  collateAll(kind, "\0", -1);
  return true;
}

// Records the current line as src open (type 3) or src close (type 4).
// For an opening line that also matches rs.src_f, the string data is the
// text that follows the keyword and one separator character.
bool ParseF::checkForSRC() {
  if (!std::regex_search(cline, rs.s, rs.src_g)) {
    if (!std::regex_search(cline, rs.s, rs.src_h))
      return false;
    collateAll(4, "\0", -1);
    return true;
  }

  // Position just past the keyword and the character after it.
  const int argStart = rs.s.length() + 1;
  const bool hasArgs = std::regex_search(cline, rs.s, rs.src_f);
  collateAll(3, hasArgs ? cline.substr(argStart) : std::string("\0"), -1);
  return true;
}

// Records the current line as a URL (type 5) when rs.url_h matches; the
// matched text itself is not stored.
bool ParseF::checkForURL() {
  if (!std::regex_search(cline, rs.s, rs.url_h))
    return false;

  collateAll(5, "\0", -1);
  return true;
}

// Records the current line as a timestamp (type 6) when rs.timestamp_h
// matches; the matched text itself is not stored.
bool ParseF::checkForTIMESTAMP() {
  if (!std::regex_search(cline, rs.s, rs.timestamp_h))
    return false;

  collateAll(6, "\0", -1);
  return true;
}

/**
 * Entry point: parse the command line, classify every line of every input
 * file, then print the records and/or write them to dump.json.
 *
 * @return EXIT_SUCCESS when everything was read and written, EXIT_FAILURE
 *         when argument handling threw, an input file could not be opened or
 *         read, or dump.json could not be written.
 */
int main(int argc, char *argv[]) {
  std::vector<std::string> flist;
  try {
    flist = parseArgs(argc, argv);
  } catch (const std::exception &e) {
    // e.g. a directory that cannot be traversed
    std::cerr << "error: " << e.what() << "\n";
    return EXIT_FAILURE;
  }

  bool ok = true;
  ParseF x;
  // flist[0] is a placeholder inserted by parseArgs(); real paths start at 1.
  for (std::size_t i = 1; i < flist.size(); i++) {
    const std::string &filename = flist[i];
    x.cfilename = filename;

    std::ifstream cfile{filename};
    if (!cfile) {
      std::cerr << "error: cannot open " << filename << "\n";
      ok = false;
      continue;
    }

    int clnum{0};
    // Read straight into x.cline to avoid one extra copy of every line.
    // f.v_nh is the remaining heading budget; it is shared by all files and
    // only decreases when -nh was given.
    while (std::getline(cfile, x.cline) && f.v_nh > 0) {
      x.clnum = clnum;
      if (x.checkForHeadings()) {
        if (f.f_nh)
          f.v_nh--;
      } else if (x.checkForQUOTE() || x.checkForSRC() || x.checkForURL() ||
                 x.checkForTIMESTAMP()) {
        // the matching check has already recorded the line
      } else {
        x.collateAll(-1, x.cline, -1);
      }
      clnum++;
    }

    // badbit signals a genuine I/O error, unlike the eof/fail state that
    // ends a normal read.
    if (cfile.bad()) {
      std::cerr << "error: failed while reading " << filename << "\n";
      ok = false;
    }
  }

  if (f.f_p)
    x.printAll();

  if (f.f_json) {
    std::ofstream o("dump.json");
    o << std::setw(4) << x.jo << '\n'; // setw is overloaded to set pretty printing
    o.close();
    if (!o) {
      std::cerr << "error: could not write dump.json\n";
      ok = false;
    }
  }

  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

Sample org-mode files:

https://github.com/emacs-straight/org-mode/raw/main/doc/org-manual.org

https://github.com/emacs-straight/org-mode/raw/main/doc/org-guide.org

compile: (requires https://github.com/nlohmann/json)

g++ parsertest.cpp -std=c++17 -Lnlohmann/json -o parsertest

usage:

./parsertest -p file.org

~/C++/orgparser$ time ./parsertest -d ~/bigorgdirectory/ -j

real    0m39.509s
user    0m38.902s
sys 0m0.496s
~/C++/orgparser$ fd -t f -e org . ~/bigorgdirectory/ -x wc -l | awk '{ sum += $1 } END { print sum }'
666342
~/C++/orgparser$ du -sh ~/bigorgdirectory/
86M /home/user/bigorgdirectory

Right now, this parses ~2 MB per second. I suppose that is slow. How can I speed it up? Should I not use regular expressions? Please point out any noob-tier mistakes I might have made.

\$\begingroup\$ Regular Expressions: Now You Have Two Problems \$\endgroup\$

Loki Astari
– Loki Astari

2022-04-28 18:27:22 +00:00
Commented Apr 28, 2022 at 18:27 — Loki Astari
– Loki Astari, Commented Apr 28, 2022 at 18:27
\$\begingroup\$ Thank you, that was a good blog post. \$\endgroup\$

precompute
– precompute

2022-04-30 19:24:16 +00:00
Commented Apr 30, 2022 at 19:24 — precompute
– precompute, Commented Apr 30, 2022 at 19:24

G. Sliepen · Accepted Answer · 2022-04-28 20:03:41Z

Naming things

The first thing that strikes me about your code is that you have a lot of variable and struct names that are unnecessarily abbreviated. While it saves some typing in the beginning, it makes it harder for others to read your code, including your future self in just half a year, because you will have forgotten what exactly every abbreviation meant.

While overly long names are not helpful either, here are some general rules of thumb:

Don't use one-character names unless they have a very commonly used meaning, like i for an integer index, or x/y/z for coordinates.
Short names are OK if they are completely self-explanatory or if their meaning is clear from the immediate context.
Some abbreviations are fine if they are commonly used, like str to denote a string, namespace fs is another one that is fine.

If I just look at some of the names in your code, I have questions like:

ParseF: is this a parser? Or does it hold parsing results? What does the F mean?
cdb, is that a database of something? If so, what is the c?
jo, if I didn't see that its type is json I would have no clue that this probably means "JSON object", although even that is not very helpful.

Your function names are all helpfully descriptive.

Avoid using global variables

Global variables can sometimes be necessary, but they are problematic in larger projects and in libraries, as they pollute the global namespace. There are several ways to avoid these issues:

Make them static, so they will only be visible to the source file they are in, and not pollute the global namespace of symbols during linking.
Even better, if they are only used inside one class or struct, move them into that class/struct. If they are only used within one function, move them into that function.
If they are shared between classes/functions, you might declare them in one and pass them to the other via function arguments.

rs is only used within member functions of ParseF, so just make it a member variable of ParseF.

f is mostly used in parseArgs() and main(), and f.f_json is used inside ParseF::collateAll(). I would add a member variable flist to Flags, so parseArgs() can return both the parsed flags and the list of files in the return value. I would also add a f_json member variable to ParseF, and add a constructor that sets that variable. This way, after parsing the command line options, main() can pass the value of f.f_json to x.

Prefer creating a new `struct` instead of a `tuple` type

cdb is a vector of std::tuple<int, int, std::string, int, std::string>. When I look at that type, it basically tells me nothing except it's something that consists of three integers and two strings. What does each of the components mean? Will you remember the correct order of things? While std::tuple and std::pair have their uses, it's much better to instead define a struct where possible, as a struct will have a name of itself, and each of the components of that struct will have a name, thereby documenting itself. You also don't have to bother with std::make_tuple and std::get. So:

// One classified line of an input file. The type stays an aggregate, so it
// can still be brace-initialised field by field; the default member
// initialisers only ensure that a default-constructed Entry never carries
// indeterminate integers.
struct Entry {
    int line_number{};
    int type{};
    std::string string_data;
    int int_data{}; // or string_length?
    std::string filename;
};

std::vector<Entry> cdb;

To print entries, you now write:

// Sketch: print every stored entry by field name; the remaining fields are
// elided with `...`.
for (const auto &entry: cdb) {
    std::cout << entry.line_number << ", " << entry.type << ", " << ...;
}

And to add a new one:

cdb.emplace_back(clnum, ttype, sdata, idata, cfilename);

Restructure the code

ParseF has some functions that help parse a single line, but the main parsing loop is in main(). It would be much nicer to have a parseFile() member function in ParseF (ooh, was that the F in the name of that struct?).

// Sketch of a member function that reads `filename` line by line; the rest of
// the loop condition and the loop body are elided with `...`.
void ParseF::parseFile(const std::string &filename) {
    std::ifstream file(filename);
    std::string line;

    while (std::getline(file, line) && ...) {
        ...
    }
}

This moves a lot of the parsing responsibilities out of main(), it now just has to loop over the filenames:

// Sketch of main() once the per-file work lives in ParseF::parseFile();
// argument handling and output are elided with `...`.
// NOTE(review): parseArgs() in the question stores an empty placeholder at
// flist[0]; a range-for like this one would visit that entry too.
int main(int argc, char *argv[]) {
    ...
    for (const auto &filename: flist) {
        x.parseFile(filename);
    }
    ...
}

Another benefit is that all the helper functions (collateAll(), checkFor*()) can now be made private.

Error handling

Your code ignores all errors. Consider that some files passed on the command line might not exist, or you don't have permissions to read, or there is a read error halfway reading the file. The user might have passed incorrect flags. Some files might not be valid org-mode files. The problem with ignoring these issues is that the program will just happily run without printing any warning and returning with an exit code of zero, and no-one is the wiser that the resulting dump.json file contains incorrect data.

Make sure you check that files are read correctly by checking that cfile.eof() == true after the while-loop that reads in all the lines. Check that the output was written correctly by calling o.close() and then checking that o.good() == true. If anything is wrong, write an error message to std::cerr and make sure you return a non-zero exit code, preferably EXIT_FAILURE.

Preferably, you should also validate that the input files are correct.

Increasing performance

Right now, this parses ~2 MB per second. I suppose that is slow. How can I speed it up? Should I not use regular expressions?

Martin York is correct here. Regular expressions are a useful tool, but beware of using the wrong tool for a given problem. Regular expressions can be fast, but usually only when they can process a large amount of data at a time. If you process only a line at a time, have to check this against 8 different regular expressions, and only one of them can match successfully at a time anyway, you are going to waste a lot of CPU time.

In your case, I think the quickest is to create a hand-written decision tree to classify each line. A lot can be learned from just looking at the first character:

If it starts with *, it's a heading
If it starts with #, it's a begin or end clause
If it starts with h, it might be a URL
If it starts with [, it might be a timestamp.

You could write this as a series of if-statements, or perhaps also using a switch-statement.

Thank you, your advice was very useful. I improved my program, and it's much more readable (at least to me, anyway) and ~65x faster. I have not (yet) implemented everything, but I plan to. Here's what I have right now: gist.github.com/t-e-r-m/626b94c7f8e29eece5e34bafe1c0232c Thanks again! — precompute
– precompute, Commented Apr 30, 2022 at 19:23
Great! Feel free to post your revised code as a new question on CodeReview. — G. Sliepen
– G. Sliepen, Commented Apr 30, 2022 at 19:59

user258099 · Accepted Answer · 2022-04-29 03:59:02Z

0

You can get significant performance boosts by having the compiler optimize your code. Try -O3 (please check the g++ man page for the compiler flags).

answered Apr 29, 2022 at 3:59

user258099

1

1

\$\begingroup\$ While your answer is correct as far as it goes, it doesn't make any insightful observations about the code, which is what we really do here on Code Review. Please read How do I write a good answer?. \$\endgroup\$

pacmaninbw
– pacmaninbw ♦

2022-04-29 14:38:12 +00:00
Commented Apr 29, 2022 at 14:38

Add a comment |

Stack Exchange Network

Markup Language Parser in C++

2 Answers 2

Naming things

Avoid using global variables

Prefer creating a new `struct` instead of a `tuple` type

Restructure the code

Error handling

Increasing performance

You must log in to answer this question.

Hot Network Questions

Markup Language Parser in C++

2 Answers 2

Naming things

Avoid using global variables

Prefer creating a new struct instead of a tuple type

Restructure the code

Error handling

Increasing performance

You must log in to answer this question.

Related

Hot Network Questions

Prefer creating a new `struct` instead of a `tuple` type