7
\$\begingroup\$

I am working on a coding challenge from the Hackerrank site. Given two equal-length arrays of integers, with values from 2 to 109, find the maximum number of times we can remove a pair (Ai, Bj) where Ai and Bj are not co-prime.

The programming language of my choice is Python2. Hackerrank has timeout of 10 secs for the Python2 submissions and it is noteworthy that Hackerrank doesn't provide the numpy package for the standard challenges.

My current approach is to reduce this problem to well-known Max-Flow problem and solve it using the Dinic algorithm.

My Python code:

# -*- coding: utf-8 -*- #!/bin/python from collections import * def gen_primes(): D = {} q = 2 while True: if q not in D: yield q D[q * q] = [q] else: for p in D[q]: D.setdefault(p + q, []).append(p) del D[q] q += 1 # precompute prime numbers smaller than sqrt(10**9) prime_gen = gen_primes() primes = [next(prime_gen) for _ in xrange(3500)] # enhanced trial division method using precomputed prime numbers def trial_division(n): a = set() while n % 2 == 0: a.add(2) n /= 2 i = 1 f = primes[i] while f * f <= n: if n % f == 0: a.add(f) n /= f else: i += 1 f = primes[i] if n != 1: a.add(n) return a def bfs(graph, cap_edges, level, s, t): queue = deque() queue.append(s) level[s] = 1 while queue: u = queue.popleft() for v, eid, _ in graph[u]: if level[v] == 0 and cap_edges[eid] > 0: level[v] = level[u] + 1 queue.append(v) return level[t] > 0 def dfs(graph, ptr, cap_edges, level, u, s, t): if u == t: return 1 adj = graph[u] ind = ptr[u] i = ind n = len(adj) while i < n: v, eid, eid_b = adj[i] ptr[u] = i i += 1 if (level[v] == level[u] + 1) and cap_edges[eid] > 0: f = dfs(graph, ptr, cap_edges, level, v, s, t) if f > 0: cap_edges[eid] -= 1 cap_edges[eid_b] += 1 return f return 0 # solve the max-flow problem using the Dinic algorithm def max_flow(graph, cap_edges, s, t): n = len(graph) level = [0] * n ptr = [0] * n flow = 0 while bfs(graph, cap_edges, level, s, t): f = dfs(graph, ptr, cap_edges, level, s, s, t) while f > 0: flow += f f = dfs(graph, ptr, cap_edges, level, s, s, t) level = [0] * n ptr = [0] * n return flow def computer_game(n, A, B): start_node = 0 end_node = 1 graph = defaultdict(list) cap_edges = [] node_count = 2 edges_count = 0 prime_nodes_map = {} for value in A: current_node = node_count graph[start_node].append((current_node, edges_count, edges_count+1)) cap_edges.append(1) graph[current_node].append((start_node, edges_count+1, edges_count)) cap_edges.append(0) edges_count += 2 node_count += 1 factors = trial_division(value) for p in factors: if p not in prime_nodes_map: prime_nodes_map[p] = node_count node_count += 1 p_node = prime_nodes_map[p] graph[current_node].append((p_node, edges_count, edges_count+1)) cap_edges.append(1) graph[p_node].append((current_node, edges_count+1, edges_count)) cap_edges.append(0) edges_count += 2 for value in B: current_node = node_count graph[current_node].append((end_node, edges_count, edges_count+1)) cap_edges.append(1) graph[end_node].append((current_node, edges_count+1, edges_count)) cap_edges.append(0) edges_count += 2 node_count += 1 factors = trial_division(value) for p in factors: if p not in prime_nodes_map: prime_nodes_map[p] = node_count node_count += 1 p_node = prime_nodes_map[p] graph[p_node].append((current_node, edges_count, edges_count+1)) cap_edges.append(1) graph[current_node].append((p_node, edges_count+1, edges_count)) cap_edges.append(0) edges_count += 2 result = max_flow(graph, cap_edges, start_node, end_node) print(result) if __name__ == '__main__': n = int(raw_input()) a = map(int, raw_input().rstrip().split()) b = map(int, raw_input().rstrip().split()) computer_game(n, a, b) 

This Python code gets timeout for the last 2 biggest test instances, but I know in fact that I am using the right algorithm because I submitted a C++ solution implementing the exact same algorithm, which passes all the test cases with flying colors.

So my question is, can my Python code above be optimized any further? Did I miss some obvious optimizations? At this point I am wondering if this challenge can be solved with Python at all...

Here is a large test-case pair which causes timeout: input.txtoutput.txt

For the comparison, here is my C++ solution which passes all test cases:

#include <bits/stdc++.h> using namespace std; struct Entry { int node; int eid; int eid_r; }; vector<int> gen_primes(int n) { vector<int> result; bool prime[n+1]; memset(prime, true, sizeof(prime)); for (int p=2; p*p<=n; p++) { if (prime[p]) { for (int i=p*2; i<=n; i += p) prime[i] = false; } } for (int p=2; p<=n; p++) { if (prime[p]) { result.push_back(p); } } return result; } vector<int> factorization(int n, const vector<int>& primes) { vector<int> a; if (!(n&1)) { a.push_back(2); n >>= 1; while (!(n&1)) { n >>= 1; } } int i = 1; int f = primes[i]; while (f*f <= n) { if (n % f == 0) { a.push_back(f); n /= f; while (n % f == 0) { n /= f; } } f = primes[i++]; } if (n != 1) { a.push_back(n); } return a; } bool bfs(const vector<vector<Entry>> &graph, vector<int>& cap_edges, vector<int>& level, int s, int t) { queue<int> q; q.push(s); level[s] = 1; while (!q.empty()) { int u = q.front(); q.pop(); for (const Entry &e : graph[u]) { if ((level[e.node] == 0) && (cap_edges[e.eid] > 0)) { level[e.node] = level[u] + 1; q.push(e.node); } } } return level[t] > 0; } int dfs(const vector<vector<Entry>> &graph, vector<int>& ptr, vector<int>& cap_edges, vector<int>& level, int u, int s, int t) { if (u == t) { return 1; } for (int i = ptr[u]; i < graph[u].size(); i++) { const Entry& e = graph[u][i]; ptr[u] = i; if ((level[e.node] == level[u]+1) && (cap_edges[e.eid] > 0)) { int f = dfs(graph, ptr, cap_edges, level, e.node, s, t); if (f > 0) { cap_edges[e.eid] -= f; cap_edges[e.eid_r] += f; return f; } } } return 0; } int max_flow(const vector<vector<Entry>> &graph, vector<int>& cap_edges, int s, int t) { int n = graph.size(); vector<int> level(n, 0); vector<int> ptr(n, 0); int flow = 0; while (bfs(graph, cap_edges, level, s, t)) { int f; do { f = dfs(graph, ptr, cap_edges, level, s, s, t); flow += f; } while(f > 0); fill(level.begin(), level.end(), 0); fill(ptr.begin(), ptr.end(), 0); } return flow; } void solve(int n, const vector<int>& A, const vector<int>& B) { vector<int> primes = gen_primes(32000); int start_node = 0; int end_node = 1; vector<vector<Entry>> graph(310000); vector<int> cap_edges; int node_count = 2; int edge_count = 0; map<int, int> prime_nodes_map; for (int value : A) { int current_node = node_count; graph[start_node].push_back({current_node, edge_count, edge_count+1}); cap_edges.push_back(1); graph[current_node].push_back({start_node, edge_count+1, edge_count}); cap_edges.push_back(0); edge_count += 2; node_count += 1; vector<int> prime_factors = factorization(value, primes); for (int p : prime_factors) { auto it = prime_nodes_map.find(p); if (it == prime_nodes_map.end()) { prime_nodes_map[p] = node_count; node_count += 1; } int p_node = prime_nodes_map[p]; graph[current_node].push_back({p_node, edge_count, edge_count+1}); cap_edges.push_back(1); graph[p_node].push_back({current_node, edge_count+1, edge_count}); cap_edges.push_back(0); edge_count += 2; } } for (int value : B) { int current_node = node_count; graph[current_node].push_back({end_node, edge_count, edge_count+1}); cap_edges.push_back(1); graph[end_node].push_back({current_node, edge_count+1, edge_count}); cap_edges.push_back(0); edge_count += 2; node_count += 1; vector<int> prime_factors = factorization(value, primes); for (int p : prime_factors) { auto it = prime_nodes_map.find(p); if (it == prime_nodes_map.end()) { prime_nodes_map[p] = node_count; node_count += 1; } int p_node = prime_nodes_map[p]; graph[p_node].push_back({current_node, edge_count, edge_count+1}); cap_edges.push_back(1); graph[current_node].push_back({p_node, edge_count+1, edge_count}); cap_edges.push_back(0); edge_count += 2; } } cout << max_flow(graph, cap_edges, start_node, end_node) << endl; } int main(int argc, char **argv) { int n; cin >> n; vector<int> a; for (int i = 0; i < n; i++) { int val; cin >> val; a.push_back(val); } vector<int> b; for (int i = 0; i < n; i++) { int val; cin >> val; b.push_back(val); } solve(n, a, b); } 
\$\endgroup\$
6
  • 2
    \$\begingroup\$I don't think it's unreasonable for the same algorithm to run in over 5 times the equivalent in C/C++. A quick glance at Julia's benchmarks shows this to be true.\$\endgroup\$
    – Peilonrayz
    CommentedApr 25, 2019 at 17:49
  • \$\begingroup\$@Peilonrayz Thank you for your comment. Funny you mention factor 5, because thats exactly the factor between c++ timeout (2 secs) and python timeout (10 secs) on Hackerrank. But in my case, the python code is ca. 10 times slower than c++ equivalent. Thats why I asked the question in the first place to learn how to improve the code further to reach at least factor 5...\$\endgroup\$
    – bmk
    CommentedApr 25, 2019 at 18:06
  • \$\begingroup\$Have you tried caching the results of your trial_division function? And checking the cache inside your n loop? It seems likely that you would see some hits, but I'm not sure how many.\$\endgroup\$
    – aghast
    CommentedApr 25, 2019 at 20:54
  • \$\begingroup\$@AustinHastings Yes, I tried to memoize the trial_division function but the hits were negligible since the input data are uniformly random generated between 2 and 10^9...\$\endgroup\$
    – bmk
    CommentedApr 25, 2019 at 21:15
  • \$\begingroup\$I was hoping you would get memo hits on smaller values of n. Too bad.\$\endgroup\$
    – aghast
    CommentedApr 25, 2019 at 21:46

1 Answer 1

4
\$\begingroup\$

It's not clear to me why gen_primes() is so complicated. The C++ version, which is a straight sieve, is much clearer.


There's one red flag in trial_division: /= is floating point division. Averaging timing runs would obviously be better, but even on a single timing run changing the two /= to //= drops the run time from 20.5 secs to 16.2 secs.

The iteration in trial_division is also unidiomatic. Replacing it with a for loop:

def trial_division(n): a = set() while n % 2 == 0: a.add(2) n //= 2 for p in primes: if p * p > n: break while n % p == 0: a.add(p) n //= p if n != 1: a.add(n) return a 

the time drops to 12.5 secs.

Profiling with python -m cProfile cr219119.py <cr219119.in shows that trial_division is still by far the bottleneck. The expensive operations are the multiplication and division, and the division is a problem, but we can remove the multiplication with

primesWithSquares = [(p, p*p) for p in primes] 

and changing the loop to

 for p, p2 in primesWithSquares: if p2 > n: break while n % p == 0: a.add(p) n //= p 

to get down to 10.8 secs.

Finally, using the same trick as the C++ code of doubling up the tests to use a list rather than a set:

def trial_division(n): a = [] if n & 1 == 0: a.append(2) n //= 2 while n & 1 == 0: n //= 2 for p, p2 in primesWithSquares: if p2 > n: break if n % p == 0: a.append(p) n //= p while n % p == 0: n //= p if n != 1: a.append(n) return a 

I get a running time of 9.4 secs.


As far as the rest of the code goes, I find it severely lacking in comments. A lot of the names are quite good (trial_division and computer_game are the worst exceptions), but I had to reverse engineer the code to figure out what prime_nodes_map was about and I still don't fully understand the graph data structure. There also seems to be a lot of potential to factor out some of the code involved in the graph construction. Perhaps (untested)

def add_edge(u, v, direction): graph[u].append((v, edges_count, edges_count+1)) cap_edges.append(direction) graph[v].append((u, edges_count+1, edges_count)) cap_edges.append(1 - direction) edges_count += 2 def new_node(): node_count += 1 return node_count - 1 def add_half_graph(values, end_node, direction): for value in values: current_node = new_node() add_edge(end_node, current_node, direction) for p in trial_division(value): if p not in prime_nodes_map: prime_nodes_map[p] = new_node() add_edge(current_node, prime_nodes_map[p], direction) add_half_graph(A, start_node, 1) add_half_graph(B, end_node, 0) 
\$\endgroup\$
6
  • \$\begingroup\$Interesting. I always assumed something like a/b, for a and b being integers, would use an integer division in Python 2 since the result is type int, but the dis documentation says it's actually a different op from //.\$\endgroup\$
    – AlexV
    CommentedApr 26, 2019 at 11:49
  • \$\begingroup\$@AlexV, I should confess that I profiled with Python3, making minor tweaks, because I don't have Python2 on this machine. I may have put my foot in it with the divisions.\$\endgroup\$CommentedApr 26, 2019 at 11:55
  • \$\begingroup\$@PeterTaylor Thank you so much for your in depth review! Really appreciate it. If I may make some comments about your points: As someone already mentioned, the / operator is an integer division in Python 2 if you don't import __future__.division (which I didn't). But the caching of the squared prime numbers saves indeed lots of time (about 4 secs on my machine). It never occurred to me that multiplication could be such a performance killer, especially if the results are less than 10^9. Anyway thank you so much and I will try to resubmit the solution with cached prime-squares...\$\endgroup\$
    – bmk
    CommentedApr 26, 2019 at 15:46
  • \$\begingroup\$fyi: I just resubmitted the revised code with cached squared prime numbers but the last 2 test-cases still time out... But I think I'm getting closer\$\endgroup\$
    – bmk
    CommentedApr 26, 2019 at 15:56
  • \$\begingroup\$(Untested and knowing Python likely to be wrong) Given that for each while n % p == 0: n //= p you're effectively performing the same calculation twice, if you change it to one divmod you may get a performance increase.\$\endgroup\$
    – Peilonrayz
    CommentedApr 27, 2019 at 15:37

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.