I'm new to C++ and decided to write a parser for a markup language. I chose org-mode because it seemed relatively straightforward.
parsertest.cpp
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <regex>
#include <string>
#include <system_error>
#include <tuple>
#include <utility>
#include <vector>

#include <nlohmann/json.hpp>
// Short alias for the C++17 filesystem library (paths, existence checks,
// recursive directory traversal).
namespace fs = std::filesystem;
// Alias for the nlohmann JSON value type that holds the JSON dump.
using json = nlohmann::json;
// Command-line options. The single global instance `f` is written by
// parseArgs() and read by the parsing code and main().
struct Flags {
  // Value of -nh ("Number of Headings"); main() stops reading lines once it
  // has been counted down to zero.
  int v_nh = 1;
  // True when a heading limit was supplied, i.e. v_nh should be counted down.
  bool f_nh = false;
  // -p: print every collected record at the end.
  bool f_p = false;
  // Directory given with -d; empty until one is supplied.
  std::string v_dir;
  // -d: take the input files from a recursive walk of v_dir.
  bool f_dir = false;
  // -j: collect into JSON and write it out instead of printing.
  bool f_json = false;
} f;
// Compiled patterns used to classify a single line, plus the shared match
// result they write into. The single global instance is `rs`.
struct RegexStrings {
  // Scratch match result filled by the std::regex_search calls of the
  // ParseF::checkFor*() members.
  std::smatch s;
  // Heading: ONE or more '*' at the start of the line, then a space.
  // (The previous pattern used '*' as the quantifier, which also matched zero
  // asterisks, so every line beginning with a space counted as a heading.)
  const std::regex headings_h{R"(^\*+ )"};
  // Whole line is exactly "#+begin_quote" / "#+end_quote".
  const std::regex quote_g{R"(^\#\+begin_quote$)"};
  const std::regex quote_h{R"(^\#\+end_quote$)"};
  // "#+begin_src " followed by the rest of the line.
  const std::regex src_f{R"(^\#\+begin_src .*)"};
  // Line starts with "#+begin_src".
  const std::regex src_g{R"(^\#\+begin_src)"};
  // Whole line is exactly "#+end_src".
  const std::regex src_h{R"(^\#\+end_src$)"};
  // Line starts with http:// or https:// followed by at least one
  // [[:graph:]] character.
  const std::regex url_h{R"(^https?:\/\/[[:graph:]]+)"};
  // Whole line has the form "[dd-dd-dd dd:dd:dd]" (d = digit).
  const std::regex timestamp_h{R"(^\[\d\d-\d\d-\d\d \d\d:\d\d:\d\d\]$)"};
} rs;
// Collects every file with the extension ".org" below `path`, descending into
// sub-directories.
//
// path: directory to walk.
// Returns a list whose element 0 is an empty placeholder string (main()
// starts iterating at index 1), followed by the paths that were found. If
// `path` is not a directory a message is written to std::cerr and only the
// placeholder is returned.
std::vector<std::string> getOrgFilesRecursively(const std::string &path) {
  std::vector<std::string> flist{""};
  const std::filesystem::path root{path};
  std::error_code ec;
  if (!std::filesystem::is_directory(root, ec)) {
    std::cerr << "not a directory: " << path << '\n';
    return flist;
  }
  // skip_permission_denied: an unreadable sub-directory is skipped instead of
  // aborting the whole walk with an exception.
  const auto options =
      std::filesystem::directory_options::skip_permission_denied;
  for (const auto &entry :
       std::filesystem::recursive_directory_iterator(root, options)) {
    if (entry.is_directory(ec)) {
      continue;
    }
    // Compare the real extension rather than the last three characters: the
    // suffix test threw for paths shorter than three characters and accepted
    // names that merely end in "org".
    if (entry.path().extension() == ".org") {
      flist.emplace_back(entry.path().string());
    }
  }
  return flist;
}
std::vector<std::string> parseArgs(int a, char *b[]) {
if (strcmp(b[1], "-help") == 0) {
std::cout << "Org-Mode parser\n"
<< "-nh: Number of Headings\n"
<< "-p: Print object stack\n"
<< "-d: Recurse directory\n"
<< "-j: dump json\n";
exit(0);
}
std::vector<std::string> flist{""};
for (int i = 1; i < a; i++) {
if (b[i][0] != '-') {
auto carg = std::string(b[i]);
fs::path cpath{carg};
if (carg.substr(carg.length() - 3) == "org") {
if (!fs::exists(cpath)) {
continue;
}
flist.emplace_back(carg);
}
} else {
if (b[i][1] == 'n') {
if (b[i][1] == 'h') {
i++;
auto carg = std::string(b[i]);
f.v_nh = stoi(carg);
f.f_nh = true;
}
}
if (b[i][1] == 'p') {
f.f_p = true;
}
if (b[i][1] == 'd') {
i++;
f.v_dir = std::string(b[i]);
f.f_dir = true;
}
if (b[i][1] == 'j') {
f.f_json = true;
}
}
}
if (f.f_dir)
flist = getOrgFilesRecursively(f.v_dir);
if (f.f_p && f.f_json)
f.f_p = false; // You can view the json instead
return flist;
}
// Line classifier and result store. The caller fills in `cline`, `clnum` and
// `cfilename` for the current line and then calls the checkFor*() members;
// each match is recorded through collateAll().
struct ParseF {
  // Collected records (used when JSON output is off). Tuple layout:
  //   <0> line number
  //   <1> type code
  //         -1 text
  //          0 heading
  //          1 quote open
  //          2 quote close
  //          3 src open
  //          4 src close
  //          5 url
  //          6 timestamp
  //   <2> string data
  //   <3> integer data
  //   <4> file name
  std::vector<std::tuple<int, int, std::string, int, std::string>> cdb;

  // The line currently being classified.
  std::string cline;
  // Name of the file the current line belongs to.
  std::string cfilename;
  // Number of the current line.
  int clnum = 0;
  // Collected records (used when JSON output is on).
  json jo;

  // Output helpers.
  void printAll();
  void printLastInsert();

  // Stores one record: (type code, string data, integer data).
  void collateAll(int, std::string, int);

  // Classifiers: each returns true when `cline` matched and was recorded.
  bool checkForHeadings();
  bool checkForQUOTE();
  bool checkForSRC();
  bool checkForURL();
  bool checkForTIMESTAMP();
};
// Writes a header line followed by every record in `cdb` to std::cout, one
// record per line with the fields separated by ", ".
void ParseF::printAll() {
  std::cout << "line, type, data, data, filename\n";
  for (const auto &[lineNo, kind, text, number, file] : cdb) {
    std::cout << lineNo << ", " << kind << ", " << text << ", " << number
              << ", " << file << "\n";
  }
}
// Writes the most recently stored record of `cdb` to std::cout in the same
// ", "-separated form printAll() uses. Does nothing when `cdb` is empty
// (nothing stored yet, or JSON mode, where records go to `jo` instead);
// calling back() on an empty vector would be undefined behaviour.
void ParseF::printLastInsert() {
  if (cdb.empty()) {
    return;
  }
  // Bind by reference: no copy of the tuple and its two strings.
  const auto &[lineNo, kind, text, number, file] = cdb.back();
  std::cout << lineNo << ", " << kind << ", " << text << ", " << number << ", "
            << file << "\n";
}
// Stores one record for the current line (`clnum` of `cfilename`).
//
// ttype: type code of the line (see the table on ParseF::cdb).
// sdata: string data of the record.
// idata: integer data of the record.
//
// In JSON mode (f.f_json) the record goes into `jo`, otherwise it is appended
// to `cdb`.
void ParseF::collateAll(int ttype, std::string sdata, int idata) {
  if (f.f_json) {
    // Resolve the file -> line -> type node once instead of walking the three
    // nested objects for each of the two insertions.
    auto &node = jo[cfilename][std::to_string(clnum)][std::to_string(ttype)];
    // operator[] creates the key with a null value.
    // NOTE(review): both data items are therefore stored as KEYS under the
    // type node, not as values - confirm this layout is what consumers of
    // dump.json expect.
    node[sdata];
    node[std::to_string(idata)];
  } else {
    // Construct the tuple in place and move the by-value string into it
    // rather than building a temporary tuple and copying.
    cdb.emplace_back(clnum, ttype, std::move(sdata), idata, cfilename);
  }
}
// Records `cline` as a heading (type 0) when it starts with one or more '*'
// followed by a space. The stored string data is the text after that space
// and the stored integer data is the number of asterisks.
// Returns true when the line was recorded.
bool ParseF::checkForHeadings() {
  // A heading must begin with '*'. Checking that first (a) stops lines that
  // start with a space from being recorded as "level 0" headings and (b)
  // avoids running the regex on ordinary lines.
  if (cline.empty() || cline.front() != '*') {
    return false;
  }
  if (!std::regex_search(cline, rs.s, rs.headings_h)) {
    return false;
  }
  // The match covers the asterisks plus the trailing space.
  collateAll(0, cline.substr(rs.s.length()),
             static_cast<int>(rs.s.length()) - 1);
  return true;
}
// Records `cline` as quote-open (type 1) or quote-close (type 2) when it
// matches rs.quote_g or rs.quote_h. The string data is empty and the integer
// data is -1. Returns true when the line was recorded.
bool ParseF::checkForQUOTE() {
  // Both patterns are anchored at the start of the line and begin with "#+",
  // so any other line can be rejected without running a regex.
  if (cline.compare(0, 2, "#+") != 0) {
    return false;
  }
  if (std::regex_search(cline, rs.s, rs.quote_g)) {
    collateAll(1, "\0", -1); // "\0" converts to an empty std::string
    return true;
  }
  if (std::regex_search(cline, rs.s, rs.quote_h)) {
    collateAll(2, "\0", -1);
    return true;
  }
  return false;
}
// Records `cline` as src-open (type 3) or src-close (type 4).
// For src-open the string data is whatever follows "#+begin_src " on the
// line, or empty when the keyword is not followed by a space. The integer
// data is always -1. Returns true when the line was recorded.
bool ParseF::checkForSRC() {
  // All three patterns are anchored at the start of the line and begin with
  // "#+", so any other line can be rejected without running a regex.
  if (cline.compare(0, 2, "#+") != 0) {
    return false;
  }
  if (std::regex_search(cline, rs.s, rs.src_g)) {
    // Offset of the first character after "#+begin_src" and one separator.
    const auto ss = static_cast<std::string::size_type>(rs.s.length()) + 1;
    if (std::regex_search(cline, rs.s, rs.src_f)) {
      collateAll(3, cline.substr(ss), -1);
    } else {
      collateAll(3, "\0", -1); // "\0" converts to an empty std::string
    }
    return true;
  }
  if (std::regex_search(cline, rs.s, rs.src_h)) {
    collateAll(4, "\0", -1);
    return true;
  }
  return false;
}
// Records `cline` as a URL line (type 5) when it matches rs.url_h. Only the
// type is recorded: the string data is empty and the integer data is -1.
// Returns true when the line was recorded.
bool ParseF::checkForURL() {
  // The pattern is anchored at the start of the line and begins with "http",
  // so any other line can be rejected without running the regex.
  if (cline.compare(0, 4, "http") != 0) {
    return false;
  }
  if (!std::regex_search(cline, rs.s, rs.url_h)) {
    return false;
  }
  collateAll(5, "\0", -1); // "\0" converts to an empty std::string
  return true;
}
// Records `cline` as a timestamp line (type 6) when it matches
// rs.timestamp_h. Only the type is recorded: the string data is empty and the
// integer data is -1. Returns true when the line was recorded.
bool ParseF::checkForTIMESTAMP() {
  // The pattern is anchored at the start of the line and begins with '[', so
  // any other line can be rejected without running the regex.
  if (cline.empty() || cline.front() != '[') {
    return false;
  }
  if (!std::regex_search(cline, rs.s, rs.timestamp_h)) {
    return false;
  }
  collateAll(6, "\0", -1); // "\0" converts to an empty std::string
  return true;
}
// Entry point: parses the options, classifies every line of every input file
// and then prints the records (-p) or writes them to dump.json (-j).
// Returns 0 on success and 1 when no arguments were given or dump.json could
// not be opened.
int main(int argc, char *argv[]) {
  if (argc < 2) {
    std::cerr << "no input given; run with -help for the options\n";
    return 1;
  }

  const std::vector<std::string> flist = parseArgs(argc, argv);
  ParseF x;

  // Element 0 of the list is an empty placeholder, so start at 1.
  for (std::size_t i = 1; i < flist.size(); ++i) {
    const std::string &path = flist[i];
    std::ifstream cfile{path};
    if (!cfile) {
      std::cerr << "cannot open " << path << '\n';
      continue;
    }
    x.cfilename = path;

    int lineNo{0}; // line numbers are counted from 0
    // f.v_nh is the heading limit. It is only counted down when -nh was given
    // and it is not reset per file, so the limit applies across all inputs.
    // Lines are read straight into x.cline to avoid a copy per line.
    while (f.v_nh > 0 && std::getline(cfile, x.cline)) {
      x.clnum = lineNo;
      if (x.checkForHeadings()) {
        if (f.f_nh) {
          --f.v_nh;
        }
      } else if (!(x.checkForQUOTE() || x.checkForSRC() || x.checkForURL() ||
                   x.checkForTIMESTAMP())) {
        // No classifier matched (they run in this order and stop at the
        // first match): record the line as plain text.
        x.collateAll(-1, x.cline, -1);
      }
      ++lineNo;
    }
    // cfile is closed by its destructor at the end of each iteration.
  }

  if (f.f_p) {
    x.printAll();
  }
  if (f.f_json) {
    std::ofstream o("dump.json");
    if (!o) {
      std::cerr << "cannot write dump.json\n";
      return 1;
    }
    // setw is overloaded to set pretty printing
    o << std::setw(4) << x.jo << '\n';
  }
  return 0;
}
Sample org-mode files:
https://github.com/emacs-straight/org-mode/raw/main/doc/org-manual.org
https://github.com/emacs-straight/org-mode/raw/main/doc/org-guide.org
compile: (requires https://github.com/nlohmann/json)
g++ parsertest.cpp -std=c++17 -Lnlohmann/json -o parsertest
usage:
./parsertest -p file.org
~/C++/orgparser$ time ./parsertest -d ~/bigorgdirectory/ -j
real 0m39.509s
user 0m38.902s
sys 0m0.496s
~/C++/orgparser$ fd -t f -e org . ~/bigorgdirectory/ -x wc -l | awk '{ sum += $1 } END { print sum }'
666342
~/C++/orgparser$ du -sh ~/bigorgdirectory/
86M /home/user/bigorgdirectory
Right now, this parses ~2 MB per second. I suppose that is slow. How can I speed it up? Should I not use regular expressions? Please point out any noob-tier mistakes I might have made.