For over 3 days I'm trying to solve Shortest Superstring problem and I still can't figure out how can I create resulting string when I know in what order should I use given words and how these words overlaps. I have read many solutions but I really don't understand them indeed.
That's my code:
#include <iostream>
#include <vector>
using namespace std;
struct Node
{
int cost;
int next_mask;
int next_node;
Node(int cost, int mask, int next_node) : cost(cost), next_mask(mask), next_node(next_node)
{
}
Node() : cost(-1), next_mask(-1), next_node(-1)
{
}
bool is_done() const {
return cost != -1;
}
};
// Cost = how many extra characters do I need to fully overlap w2. The characters from w1 are not extra
pair<int, string> get_cost(string const& w1, string const& w2) {
for (int i = 0; i < w1.size(); ++i) {
string sub = w1.substr(i);
int overlaps_start = w2.find(sub);
if (overlaps_start != 0)
continue;
int overlapping_count = overlaps_start + sub.size();
return { w2.size() - overlapping_count, sub };
}
return { w2.size(), "" };
}
void solve(vector<vector<int>> const& graph, int current, int mask, vector<vector<Node>>& dp) {
if (mask == (1 << graph.size()) - 1 || dp[mask][current].is_done())
return;
for (int node = 0; node < graph.size(); ++node) {
int check_state_mask = 1 << node;
if (mask & check_state_mask)
continue;
int node_mask = mask | check_state_mask;
solve(graph, node, node_mask, dp);
int sub = graph[current][node] + dp[node_mask][node].cost;
if (!dp[mask][current].is_done() || sub < dp[mask][current].cost) {
dp[mask][current].cost = sub;
dp[mask][current].next_mask = node_mask;
dp[mask][current].next_node = node;
}
}
}
string shortestSuperstring(vector<string>& words) {
vector<vector<int>> graph(words.size(), vector<int>(words.size()));
vector<vector<string>> common_parts(words.size(), vector<string>(words.size()));
for (int i = 0; i < words.size(); ++i) {
for (int j = 0; j < words.size(); ++j) {
pair<int, string> cost1 = get_cost(words[i], words[j]);
pair<int, string> cost2 = get_cost(words[j], words[i]);
graph[i][j] = cost1.first;
graph[j][i] = cost2.first;
common_parts[i][j] = cost1.second;
common_parts[j][i] = cost2.second;
}
}
// dp also tracks the path
vector<vector<Node>> dp(1 << graph.size(), vector<Node>(graph.size(), Node()));
solve(graph, 0, 1, dp);
// We start from node 0 with mask = 1, so the result is at dp[1][0]
Node parent = dp[1][0];
int parent_node = 0;
while (parent.next_node != -1) {
// What should happen here to build a result???
cout << parent_node << " : " << parent.next_node << " Common: " << common_parts[parent_node][parent.next_node] << "\n";
parent_node = parent.next_node;
parent = dp[parent.next_mask][parent.next_node];
}
return "im-too-weak-to-solve-it";
}
int main()
{
vector<string> v{ "catg","ctaagt","gcta","ttca","atgcatc" };
std::cout << shortestSuperstring(v);
}
If you run this code, you will see that for some nodes they may not be common string, and I think that's my problem. When I know where two strings overlaps, I can do some substrings and get the result. But when some strings don't have common part, then I don't know where to insert them, and I can't omit this one because the next one will depend on it and so on.
Theoretically, I could find the best position for it (most overlapping) by looking at current result, but how can I be sure that it won't destroy the result? If I put it on the end or on the beginning, then how can I be sure that there is no better solution?
So how can I construct the result if I know the word's order and how they overlaps?
catgatgcatcgctactaagtttca, so after removing overlaps I getcatgcatcgctaagttcawhich length is equal to 18. The expected string length is 16.solveis supposed to produce an ordering with the minimal total cost. Your cost is 14 but the correct cost is 12.["gruuz","zjr","uuzj","rfgr"]. Expected result is"rfgruuzjr", so it means that order is bad. I checked theget_costmethod and it works fine, so there is problem withsolvemethod. I see something strange - for equal costs, I get results that can have different lengths. When insolvemethod I do<= sub, then above case passes, but it causes fail for other test cases. Why? Maybe it's normal behavior and I need to consider 2 possible solutions: one normal and one where the last comes to the beginningof result?