1005 Programming Pattern

Statement

Metadata

作者: HOU, Qiming
单位: 浙江大学
代码长度限制: 16 KB
时间限制: 800 ms
内存限制: 64 MB

Programmers often have a preference among program constructs. For example, some may prefer if(0==a), while others may prefer if(!a). Analyzing such patterns can help to narrow down a programmer's identity, which is useful for detecting plagiarism.

Now given some text sampled from someone's program, can you find the person's most commonly used pattern of a specific length?

Input Specification

Each input file contains one test case. For each case, there is one line consisting of the pattern length $N$ ( $1\le N\le 1048576$ ), followed by one line no less than $N$ and no more than 1048576 characters in length, terminated by a carriage return \n. The entire input is case sensitive.

Output Specification

For each test case, print in one line the length- $N$ substring that occurs most frequently in the input, followed by a space and the number of times it has occurred in the input. If there are multiple such substrings, print the lexicographically smallest one.

Whitespace characters in the input should be printed as they are. Also note that there may be multiple occurrences of the same substring overlapping each other.

Sample Input 1

4
//A can can can a can.

Sample Output 1

 can 4

Sample Input 2

3
int a=~~~~~~~~~~~~~~~~~~~~~0;

Sample Output 2

~~~ 19

Tutorial

题意：

给出一个字符串 $S$ ，在所有长度为 $n$ 的字符串中找出出现次数最多的，如果有多个，输出字典序最小的。

并且还要输出出现次数。

思路：

考虑跑一个 SA，然后发现长度为 $n$ 的并且相同的子串，他们的后缀排名肯定是相同的。

并且字典序越小的，后缀排名肯定越小，所以扫一遍 $sa[\;]$ 数组即可求出答案。

Solution

C++

#include <bits/stdc++.h>
using namespace std;
const int N = 1048576 * 3 + 10;
int n;
char s[N];
struct DA {
    //求SA数组需要用到的中间变量，不需要赋值
    // c数组的范围为m + 1
    int t1[N], t2[N], c[N];
    int sa[N];
    int Rank[N];
    int height[N];
    //待排序的字符串放在str数组中，从str[0] - s[n - 1]长度为n, 最大值小于m
    int str[N];
    int n, m;
    void init(char *s, int m, int n) {
        this->m = m;
        this->n = n;
        for (int i = 0; i < n; ++i) str[i] = s[i];
        str[n] = 0;
    }
    bool cmp(int *r, int a, int b, int l) {
        return r[a] == r[b] && r[a + l] == r[b + l];
    }
    void work() {
        ++n;
        int i, j, p, *x = t1, *y = t2;
        //第一轮基数排序，如果s的最大值很大，可改为快速排序
        for (i = 0; i < m; ++i) c[i] = 0;
        for (i = 0; i < n; ++i) c[x[i] = str[i]]++;
        for (i = 1; i < m; ++i) c[i] += c[i - 1];
        for (i = n - 1; i >= 0; --i) sa[--c[x[i]]] = i;
        for (j = 1; j <= n; j <<= 1) {
            p = 0;
            //直接利用sa数组排序第二关键字
            //后面的j个数第二关键字为空的最小
            for (i = n - j; i < n; ++i) {
                y[p++] = i;
            }
            for (i = 0; i < n; ++i)
                if (sa[i] >= j)
                    y[p++] = sa[i] - j;
            //这样数组y保存的就是按照第二关键字排序的结果
            //基数排序第一关键字
            for (i = 0; i < m; ++i) c[i] = 0;
            for (i = 0; i < n; ++i) c[x[y[i]]]++;
            for (i = 1; i < m; ++i) c[i] += c[i - 1];
            for (i = n - 1; i >= 0; --i) sa[--c[x[y[i]]]] = y[i];
            //根据sa和x数组计算新的x数组
            swap(x, y);
            p = 1;
            x[sa[0]] = 0;
            for (i = 1; i < n; ++i) {
                x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++;
            }
            if (p >= n)
                break;
            //下次基数排序的最大值
            m = p;
        }
        int k = 0;
        --n;
        for (i = 0; i <= n; ++i) Rank[sa[i]] = i;
        // build height
        for (i = 0; i < n; ++i) {
            if (k)
                --k;
            j = sa[Rank[i] - 1];
            while (str[i + k] == str[j + k]) ++k;
            height[Rank[i]] = k;
        }
    }
} da;
int main() {
    scanf("%d\n%[^\n]", &n, s);
    int len = strlen(s);
    da.init(s, 128, len);
    da.work();
    //    da.work(s, len, 128);
    int pre = 1, Max = 1, pos = da.sa[1];
    for (int i = 2; i <= len; ++i) {
        if (da.height[i] >= n) {
            ++pre;
            if (pre > Max) {
                Max = pre;
                pos = da.sa[i];
            }
        } else {
            pre = 1;
        }
    }
    s[pos + n] = 0;
    printf("%s %d\n", s + pos, Max);
    return 0;
}

Tutorial1

考虑 Hash，求出现的最大次数。

比较字典序，直接二分 lcp 进行比较即可，比较的次数不会超过 $|len| - n$ 次。

本来用 map 做，有个点一直 TLE，用 unordered_map 也无济于事。

后来发现可以直接 sort，然后扫一遍就 WIN 了。

Solution1

C++

#include <bits/stdc++.h>
using namespace std;
#define fi first
#define se second
#define SZ(x) (int(x.size()))
using pII = pair<int, int>;
const int N = 1.1e6 + 10, INF = 0x3f3f3f3f;
int n;
string s;

struct Hash {
    int seed, mod;
    int base[N], a[N];
    void init(int _seed, int _mod, const string &s) {
        seed = _seed;
        mod = _mod;
        base[0] = 1;
        for (int i = 1; i < N; ++i) {
            base[i] = 1ll * base[i - 1] * seed % mod;
        }
        a[0] = 0;
        for (int i = 1; i < SZ(s); ++i) {
            a[i] = (1ll * a[i - 1] * seed % mod + s[i]) % mod;
        }
    }
    inline int get(int l, int r) {
        return (a[r] - 1ll * a[l - 1] * base[r - l + 1] % mod + mod) % mod;
    }
} hs;

inline bool ok(int x, int y) {
    if (hs.get(x, x + n - 1) == hs.get(y, y + n - 1))
        return false;
    int l = 1, r = n, res = 0;
    while (r - l >= 0) {
        int mid = (l + r) >> 1;
        if (hs.get(x, x + mid - 1) == hs.get(y, y + mid - 1)) {
            res = mid;
            l = mid + 1;
        } else {
            r = mid - 1;
        }
    }
    if (res < n && s[x + res] > s[y + res])
        return true;
    return false;
}

struct node {
    int ix, val, num;
    node() {}
    node(int ix, int val, int num) : ix(ix), val(val), num(num) {}
};

int main() {
    ios::sync_with_stdio(false);
    cin.tie(0);
    cout.tie(0);
    cin >> n;
    getline(cin, s);
    getline(cin, s);
    s.insert(0, "0");
    hs.init(233, 998244353, s);
    vector<node> vec, _vec;
    for (int i = 1; i + n <= SZ(s); ++i) {
        vec.emplace_back(i, hs.get(i, i + n - 1), 1);
    }
    sort(vec.begin(), vec.end(), [&](node a, node b) {
        return a.val < b.val;
    });
    _vec.push_back(vec.back());
    vec.pop_back();
    while (!vec.empty()) {
        node tmp = vec.back();
        vec.pop_back();
        if (tmp.val == _vec.back().val)
            ++_vec.back().num;
        else
            _vec.push_back(tmp);
    }
    sort(_vec.begin(), _vec.end(), [&](node a, node b) {
        return a.num > b.num;
    });
    node res = _vec[0];
    for (int i = 1; i < SZ(_vec); ++i) {
        node now = _vec[i];
        if (res.num > now.num)
            break;
        else if (ok(res.ix, now.ix))
            res = now;
    }
    cout << s.substr(res.ix, n) << " " << res.num << "\n";
    return 0;
}

Tutorial2

想不通为什么这些「暴力」可以过。

Solution2

C++

#include <bits/extc++.h>
#include <bits/stdc++.h>
using namespace std;
using namespace __gnu_pbds;
#define fi first
#define se second
const int N = 1048576 + 10;
int n;
string s;
typedef unsigned int ull;
// map<ull, int> mp, id;
struct Hash {
    static ull base[N];
    static void init() {
        base[0] = 1;
        for (int i = 1; i < N; ++i) base[i] = base[i - 1] * 131;
    }
    ull a[N];
    //最好改成从1开始，因为查询区间Hash值是前缀和形式
    inline void gao(string &s) {
        a[0] = 0;
        int len = s.size();
        for (int i = 1; i <= len; ++i) {
            a[i] = a[i - 1] * 131 + (s[i - 1] + 1);
        }
    }
    inline ull get(int l, int r) {
        ++l, ++r;
        return a[r] - a[l - 1] * base[r - l + 1];
    }
} hs;
ull Hash::base[N] = {0};
unordered_map<ull, int> mp;
int main() {
    Hash::init();
    ios::sync_with_stdio(false);
    cin.tie(0);
    cout.tie(0);
    cin >> n;
    getline(cin, s);
    getline(cin, s);
    hs.gao(s);
    int Max = 0, pos = -1;
    int len = s.size();
    for (int i = 0; i + n - 1 < len; ++i) {
        ull val = hs.get(i, i + n - 1);
        ++mp[val];
        if (mp[val] > Max) {
            Max = mp[val];
            pos = i;
        } else if (mp[val] == Max) {
            for (int j = 0; j < n; ++j)
                if (s[pos + j] != s[i + j]) {
                    if (s[pos + j] > s[i + j])
                        pos = i;
                    break;
                }
        }
    }
    cout << s.substr(pos, n) << " " << Max << "\n";
    return 0;
}

Last update: May 4, 2022