"""Interactive Selenium scraper that collects Max.com movie links.

The script opens Chrome, waits for the user to log in manually, and then
offers a menu with three collection modes (the /movies page, keyword
search, and genre pages).  Newly discovered links are written in chunks to
text files under ``OUTPUT_DIR`` and remembered in ``SEEN_LINKS_FILE`` so
they are not exported twice.
"""

import os
import random
import re
import string
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

SEEN_LINKS_FILE = "seen_links.txt"
OUTPUT_DIR = "output_movies"
LINKS_PER_FILE = 10

GENRE_URLS = [
    "https://play.max.com/genre/action",
    "https://play.max.com/genre/comedy",
    "https://play.max.com/genre/drama",
    "https://play.max.com/genre/horror",
    "https://play.max.com/genre/sci-fi",
    "https://play.max.com/genre/documentary",
]


def setup_browser():
    """Create and return a Chrome WebDriver configured for scraping."""
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-blink-features=AutomationControlled")
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)


def scroll_to_bottom(driver, max_wait=60):
    """Scroll the current page until its height stops growing.

    Args:
        driver: WebDriver whose current page should be scrolled.
        max_wait: Maximum number of seconds to keep scrolling.
    """
    print("\U0001f4dc Scrolling to load all movies...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    # monotonic() is immune to wall-clock adjustments during the wait.
    start_time = time.monotonic()

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2.5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("\u2705 Reached bottom of page.")
            break
        last_height = new_height
        if time.monotonic() - start_time > max_wait:
            print("\u23f1\ufe0f Scroll timed out.")
            break


def extract_movie_links(driver):
    """Return the distinct ``/movie/`` hrefs on the current page.

    The links are returned in first-seen (document) order.
    """
    anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "/movie/")]')
    ordered = {}
    for anchor in anchors:
        # Read the attribute once: every call is a WebDriver round trip and
        # may return None.
        href = anchor.get_attribute("href")
        if href and "/movie/" in href:
            ordered[href] = None
    return list(ordered)


def generate_random_query():
    """Return a random lowercase query of one or two letters."""
    letters = string.ascii_lowercase
    return "".join(random.choices(letters, k=random.choice([1, 2])))


def load_seen_links():
    """Return the set of links recorded in ``SEEN_LINKS_FILE``.

    Returns an empty set when the file does not exist.
    """
    if not os.path.exists(SEEN_LINKS_FILE):
        return set()
    with open(SEEN_LINKS_FILE, "r", encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}


def save_seen_links(new_links):
    """Append *new_links* to ``SEEN_LINKS_FILE``, one link per line."""
    with open(SEEN_LINKS_FILE, "a", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in new_links)


def save_links(links, batch_id, prefix):
    """Write *links* to ``OUTPUT_DIR`` in files of ``LINKS_PER_FILE`` links.

    Files are named ``{prefix}_{batch_id}_{n}.txt`` where ``n`` starts at 1.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for start in range(0, len(links), LINKS_PER_FILE):
        chunk = links[start:start + LINKS_PER_FILE]
        file_name = f"{prefix}_{batch_id}_{(start // LINKS_PER_FILE) + 1}.txt"
        file_path = os.path.join(OUTPUT_DIR, file_name)
        with open(file_path, "w", encoding="utf-8") as f:
            f.writelines(link + "\n" for link in chunk)
        print(f"\U0001f4be Saved {len(chunk)} links to {file_path}")


def _next_batch_id(prefix):
    """Return the first batch id for *prefix* not yet used in ``OUTPUT_DIR``.

    Starting from the next free id keeps a new session from overwriting the
    files written by earlier sessions.
    """
    pattern = re.compile(rf"^{re.escape(prefix)}_(\d+)_\d+\.txt$")
    try:
        names = os.listdir(OUTPUT_DIR)
    except FileNotFoundError:
        return 1
    used = [int(m.group(1)) for m in map(pattern.match, names) if m]
    return max(used, default=0) + 1


def _store_new_links(links, seen_links, batch_id, prefix):
    """Save the links not in *seen_links* and mark them as seen.

    Updates both the on-disk record and the in-memory *seen_links* set so
    later steps of the same session skip them.  Returns the new links.
    """
    unique_links = [link for link in links if link not in seen_links]
    if unique_links:
        save_links(unique_links, batch_id, prefix)
        save_seen_links(unique_links)
        seen_links.update(unique_links)
    return unique_links


def run_movies_mode(driver, seen_links, batch_counter):
    """Export a random sample of unseen links from the /movies page.

    Args:
        driver: Logged-in WebDriver.
        seen_links: Set of already exported links; updated in place.
        batch_counter: Batch id used in the output file names.

    Returns:
        The next unused batch id.
    """
    driver.get("https://play.max.com/movies")
    time.sleep(4)
    scroll_to_bottom(driver)
    all_links = extract_movie_links(driver)
    print(f"\u2705 Loaded {len(all_links)} movie links.")

    while True:
        try:
            count = int(input("\U0001f522 How many random movies do you want to extract? "))
        except ValueError:
            print("\u274c Invalid number.")
            continue
        # random.sample() raises ValueError for a negative sample size.
        if count >= 0:
            break
        print("\u274c Invalid number.")

    # Filter before sampling so the requested count is drawn from links that
    # have not been exported yet.
    candidates = [link for link in all_links if link not in seen_links]
    duplicates = len(all_links) - len(candidates)
    unique_links = random.sample(candidates, min(count, len(candidates)))

    if not unique_links:
        print("\u26a0\ufe0f All selected links were duplicates.")
        return batch_counter

    save_links(unique_links, batch_counter, "movies")
    save_seen_links(unique_links)
    seen_links.update(unique_links)
    print(f"\u2705 Saved {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
    return batch_counter + 1


def run_search_mode(driver, seen_links, search_counter):
    """Run several random keyword searches and export the new links found.

    Args:
        driver: Logged-in WebDriver.
        seen_links: Set of already exported links; updated in place.
        search_counter: Batch id for the first batch written.

    Returns:
        The next unused batch id.
    """
    search_terms = [
        "the", "man", "love", "dark", "moon", "fire", "red", "blue", "night",
        "girl", "life", "death", "dream", "war", "blood", "star", "light", "king", "queen",
    ]
    max_attempts = 10
    found_total = 0

    for attempt in range(max_attempts):
        query = random.choice(search_terms)
        url = f"https://play.max.com/search?q={query}"
        print(f"\n\U0001f50d Searching for query: '{query}'")
        driver.get(url)
        time.sleep(4)

        links = extract_movie_links(driver)
        unique_links = _store_new_links(links, seen_links, search_counter, "search")
        duplicates = len(links) - len(unique_links)

        if unique_links:
            print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
            search_counter += 1
            found_total += len(unique_links)
        else:
            print(f"\u26a0\ufe0f No new links found for query '{query}'.")

        # No point in waiting once the final search has finished.
        if attempt < max_attempts - 1:
            print("\u23f3 Waiting 10 seconds before next search...")
            time.sleep(10)

    # \U0001f4ca, not the surrogate pair \ud83d\udcca: lone surrogates in a
    # str cannot be encoded when printed.
    print(f"\n\U0001f4ca Search complete. Found total {found_total} new movie links in {max_attempts} searches.")
    return search_counter


def run_genre_mode(driver, seen_links, genre_batch):
    """Visit every URL in ``GENRE_URLS`` and export the new links found.

    Args:
        driver: Logged-in WebDriver.
        seen_links: Set of already exported links; updated in place.
        genre_batch: Batch id for the first batch written.

    Returns:
        The next unused batch id.
    """
    found_total = 0
    for genre_url in GENRE_URLS:
        print(f"\n\U0001f3ac Loading genre: {genre_url}")
        driver.get(genre_url)
        time.sleep(4)
        scroll_to_bottom(driver)

        links = extract_movie_links(driver)
        unique_links = _store_new_links(links, seen_links, genre_batch, "genre")
        duplicates = len(links) - len(unique_links)

        if unique_links:
            print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
            genre_batch += 1
            found_total += len(unique_links)
        else:
            print("\u26a0\ufe0f No new unique links in this genre.")
        time.sleep(5)

    print("📊 Genre scan complete. Found total", found_total, "new movie links.")
    return genre_batch


def main():
    """Run the interactive scraper until the user chooses to exit."""
    print("🛠️ Max.com Movie Scraper — Created by Mike | DRMLab.io Project")
    driver = setup_browser()

    try:
        print("\U0001f310 Opening Max.com homepage...")
        driver.get("https://play.max.com")
        time.sleep(3)

        print("\n\U0001f511 Please log in manually in the browser.")
        input("\u23f3 After you're logged in, press ENTER here to continue...")

        seen_links = load_seen_links()
        # Continue numbering after any files left by previous sessions.
        movie_batch = _next_batch_id("movies")
        search_batch = _next_batch_id("search")
        genre_batch = _next_batch_id("genre")

        while True:
            print("📋 === MAIN MENU ===")
            print("1️⃣ Extract movies from /movies page")
            print("2️⃣ Extract random movies via search")
            print("3️⃣ Extract movies by genre")
            print("4️⃣ Exit")
            choice = input("Enter your choice: ").strip()

            if choice == "1":
                movie_batch = run_movies_mode(driver, seen_links, movie_batch)
            elif choice == "2":
                search_batch = run_search_mode(driver, seen_links, search_batch)
            elif choice == "3":
                genre_batch = run_genre_mode(driver, seen_links, genre_batch)
            elif choice == "4":
                again = input("⏹ Do you want to close the browser? (yes/no): ").strip().lower()
                if again == "yes":
                    input("✅ Press ENTER to exit.")
                    break
                # Any other answer returns to the menu with the same
                # logged-in browser instead of launching a new one.
            else:
                print("\u274c Invalid choice.")
    finally:
        # Always release the browser, including on errors and Ctrl+C.
        driver.quit()


if __name__ == "__main__":
    main()